library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Loading required package: ggplot2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages -------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.1     v purrr   0.3.4
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'readr' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts ----------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x purrr::lift()   masks caret::lift()
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.6.3
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:dplyr':
## 
##     intersect, setdiff, union
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(tidyr)
#library(rapportools)

Loading the train data

# Load train data - Replace blank cells with NA
df_x_main <- read.csv('airbnb_train_x.csv', na.strings = c("", "NA"))
df_y <- read.csv('airbnb_train_y.csv', na.strings = c("", "NA"))
df_x <- df_x_main

Loading the test data

tdf_x_main <- read.csv('airbnb_test_x.csv', na.strings = c("", "NA"))
tdf <- tdf_x_main

Check for missingness

# Check for missingness
colSums(is.na(df_x))
##                                X                           access 
##                                0                            34436 
##                     accommodates                        amenities 
##                                0                                0 
##                  availability_30                 availability_365 
##                                0                                0 
##                  availability_60                  availability_90 
##                                0                                0 
##                        bathrooms                         bed_type 
##                              246                                1 
##                         bedrooms                             beds 
##                               93                               83 
##              cancellation_policy                             city 
##                                0                               49 
##                        city_name                     cleaning_fee 
##                                1                            18324 
##                          country                     country_code 
##                                1                                7 
##                      description              experiences_offered 
##                               27                                3 
##                     extra_people                     first_review 
##                                0                                8 
##                  guests_included                       host_about 
##                                0                            30812 
##             host_acceptance_rate             host_has_profile_pic 
##                            91758                              142 
##           host_identity_verified                host_is_superhost 
##                              148                              142 
##              host_listings_count                    host_location 
##                              145                              411 
##                        host_name               host_neighbourhood 
##                              147                            17233 
##               host_response_rate               host_response_time 
##                            15794                            15794 
##                       host_since        host_total_listings_count 
##                              142                              142 
##               host_verifications                      house_rules 
##                               10                            30393 
##                 instant_bookable                      interaction 
##                               12                            35880 
##         is_business_travel_ready                is_location_exact 
##                            44551                               19 
##               jurisdiction_names                         latitude 
##                            52690                               10 
##                          license                        longitude 
##                            85721                               11 
##                           market                   maximum_nights 
##                              433                               10 
##                   minimum_nights                    monthly_price 
##                               10                            79436 
##                             name            neighborhood_overview 
##                               33                            30743 
##                    neighbourhood                            notes 
##                            13553                            48533 
##                            price                    property_type 
##                               19                               20 
## require_guest_phone_verification    require_guest_profile_picture 
##                               19                               19 
##                 requires_license                        room_type 
##                               19                               19 
##                 security_deposit                   smart_location 
##                            40648                               20 
##                            space                      square_feet 
##                            21045                            98419 
##                            state                           street 
##                               22                               19 
##                          summary                          transit 
##                             2964                            29167 
##                     weekly_price                          zipcode 
##                            75895                             1049

There are a lot of columns with missing data but it’s easier to understand using proportions

missing.values <- df_x_main %>%
  gather(key = "key", value = "val") %>%
  mutate(isna = is.na(val)) %>%
  group_by(key) %>%
  mutate(total = n()) %>%
  group_by(key, total, isna) %>%
  summarise(num.isna = n()) %>%
  mutate(pct = num.isna / total * 100)
## Warning: attributes are not identical across measure variables;
## they will be dropped
levels <-
    (missing.values  %>% filter(isna == T) %>% arrange(desc(pct)))$key

percentage.plot <- missing.values %>%
      ggplot() +
        geom_bar(aes(x = reorder(key, desc(pct)),
                     y = pct, fill=isna),
                 stat = 'identity', alpha=0.8) +
      scale_x_discrete(limits = levels) +
      scale_fill_manual(name = "",
                        values = c('steelblue', 'tomato3'), labels = c("Present", "Missing")) +
      coord_flip() +
      labs(title = "Percentage of missing values", x =
             'Variable', y = "% of missing values")

percentage.plot
## Warning: Removed 10 rows containing missing values (position_stack).

We can also see how the rows are -

row.plot <- df_x %>%
  mutate(id = row_number()) %>%
  gather(-id, key = "key", value = "val") %>%
  mutate(isna = is.na(val)) %>%
  ggplot(aes(key, id, fill = isna)) +
    geom_raster(alpha=0.8) +
    scale_fill_manual(name = "",
        values = c('steelblue', 'tomato3'),
        labels = c("Present", "Missing")) +
    scale_x_discrete(limits = levels) +
    labs(x = "Variable",
           y = "Row Number", title = "Missing values in rows") +
    coord_flip()
## Warning: attributes are not identical across measure variables;
## they will be dropped
row.plot
## Warning: Removed 1000000 rows containing missing values (geom_raster).

In dealing with the missingness, let’s first create missingness indicators for all the variables. For this, we can define a function. We shall use this once we are sure we’ve made a fair amount of cleaning to most of the columns.

appendNAs <- function(dataset, cols) {
  append_these = data.frame( is.na(dataset[, cols] ))
  names(append_these) = paste(names(append_these), "NA", sep = "_")
  dataset = cbind(dataset, append_these)
  dataset[is.na(dataset)] = -1
  return(dataset)
}

A glimpse at the structure of the data

glimpse(df_x)
## Rows: 100,000
## Columns: 70
## $ X                                <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,...
## $ access                           <fct> "I like for our guests to feel rig...
## $ accommodates                     <fct> 12, 1, 10, 4, 4, 3, 6, 1, 2, 2, 1,...
## $ amenities                        <fct> "{TV,\"Cable TV\",Internet,\"Wirel...
## $ availability_30                  <fct> 16, 30, 1, 13, 14, 3, 4, 29, 2, 0,...
## $ availability_365                 <int> 305, 365, 330, 150, 39, 3, 131, 36...
## $ availability_60                  <fct> 33.0, 60.0, 25.0, 32.0, 27.0, 3.0,...
## $ availability_90                  <fct> 57, 90, 55, 62, 39, 3, 44, 87, 62,...
## $ bathrooms                        <fct> 2.5, 1.0, 2.0, 1.0, 1.0, 1.0, 2.0,...
## $ bed_type                         <fct> Real Bed, Real Bed, Real Bed, Real...
## $ bedrooms                         <fct> 4.0, 1.0, 4.0, 1.0, 2.0, 1.0, 2.0,...
## $ beds                             <fct> 6.0, 1.0, 6.0, 2.0, 2.0, 2.0, 3.0,...
## $ cancellation_policy              <fct> strict, flexible, strict, moderate...
## $ city                             <fct> Nashville, Los Angeles, San Diego,...
## $ city_name                        <fct> Nashville, Los Angeles, San Diego,...
## $ cleaning_fee                     <fct> $250.00, NA, $50.00, $45.00, $75.0...
## $ country                          <fct> United States, United States, Unit...
## $ country_code                     <fct> US, US, US, US, US, US, US, US, US...
## $ description                      <fct> "Vacation in this Nashville home f...
## $ experiences_offered              <fct> none, none, none, none, none, none...
## $ extra_people                     <fct> $0.00, $0.00, $0.00, $20.00, $50.0...
## $ first_review                     <fct> 2016-12-18, 2015-07-17, 2016-05-08...
## $ guests_included                  <dbl> 8, 1, 1, 2, 4, 2, 5, 1, 1, 1, 1, 1...
## $ host_about                       <fct> NA, NA, "I am from Virginia, USA a...
## $ host_acceptance_rate             <fct> NA, NA, 100%, NA, NA, NA, NA, NA, ...
## $ host_has_profile_pic             <fct> t, t, t, t, t, t, t, t, t, t, t, t...
## $ host_identity_verified           <fct> f, t, t, t, t, t, t, t, t, t, f, t...
## $ host_is_superhost                <fct> t, f, f, t, t, t, t, f, f, f, f, t...
## $ host_listings_count              <fct> 1.0, 1.0, 14.0, 1.0, 1.0, 5.0, 7.0...
## $ host_location                    <fct> "Nashville, Tennessee, United Stat...
## $ host_name                        <fct> May & Eric, Jonathon, Jeremy, Gene...
## $ host_neighbourhood               <fct> NA, Silver Lake, East Village, Nea...
## $ host_response_rate               <fct> 100%, NA, 100%, 100%, 100%, 100%, ...
## $ host_response_time               <fct> within an hour, NA, within an hour...
## $ host_since                       <fct> 2015-09-03, 2015-07-07, 2013-07-20...
## $ host_total_listings_count        <fct> 1.0, 1.0, 14.0, 1.0, 1.0, 5.0, 7.0...
## $ host_verifications               <fct> "['email', 'phone', 'google', 'rev...
## $ house_rules                      <fct> NA, NA, "No pets, parties, or indo...
## $ instant_bookable                 <fct> t, f, t, t, f, f, f, f, f, f, t, f...
## $ interaction                      <fct> "Don't hesitate to call or text an...
## $ is_business_travel_ready         <fct> t, NA, NA, NA, f, t, t, f, f, f, f...
## $ is_location_exact                <fct> f, t, t, t, t, t, t, t, t, t, t, t...
## $ jurisdiction_names               <fct> NA, "City of Los Angeles, CA", "SA...
## $ latitude                         <fct> 36.1467563677, 34.0842597547, 32.7...
## $ license                          <fct> NA, NA, NA, NA, 17STR-04969, 10759...
## $ longitude                        <fct> -86.81720471780001, -118.273509113...
## $ market                           <fct> Nashville, Los Angeles, San Diego,...
## $ maximum_nights                   <dbl> 365, 1125, 1125, 1125, 1125, 10, 1...
## $ minimum_nights                   <int> 2, 1, 3, 2, 2, 2, 3, 2, 2, 3, 1, 3...
## $ monthly_price                    <fct> NA, NA, NA, NA, "$2,250.00", NA, N...
## $ name                             <fct> "BRAND NEW ! NASHVILLE LOFT", "Sil...
## $ neighborhood_overview            <fct> "This modern loft is locate in a n...
## $ neighbourhood                    <fct> NA, Silver Lake, Pacific Beach, Ne...
## $ notes                            <fct> NA, NA, "Its a fun beach town - th...
## $ price                            <fct> $375.00, $175.00, $499.00, $145.00...
## $ property_type                    <fct> Loft, House, House, House, Apartme...
## $ require_guest_phone_verification <fct> f, f, f, f, f, f, f, f, f, f, f, f...
## $ require_guest_profile_picture    <fct> f, f, f, f, f, f, f, f, f, f, f, f...
## $ requires_license                 <fct> f, f, f, f, t, t, t, f, f, f, f, f...
## $ room_type                        <fct> Entire home/apt, Private room, Ent...
## $ security_deposit                 <fct> "$500.00", NA, NA, NA, "$150.00", ...
## $ smart_location                   <fct> "Nashville, TN", "Los Angeles, CA"...
## $ space                            <fct> "Beautiful brand new loft in a cen...
## $ square_feet                      <int> NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ state                            <fct> TN, CA, CA, DC, LA, CA, LA, NY, NY...
## $ street                           <fct> "Nashville, TN, United States", "S...
## $ summary                          <fct> "Vacation in this Nashville home f...
## $ transit                          <fct> NA, NA, "Ubers are everywhere. We ...
## $ weekly_price                     <fct> NA, NA, NA, NA, $700.00, NA, NA, $...
## $ zipcode                          <fct> 37203, 90026, 92109.0, 20002, 7011...

Inspect the columns that have numeric data -

1 accommodates Factor 2 availability_30 Factor 3 availability_365 int 4 availability_60 Factor 5 availability_90 Factor 6 bathrooms Factor 7 bedrooms Factor 8 beds Factor 9 cleaning_fee Factor - (needs to be cleaned - has $ symbol like $250.00) 10 extra_people Factor - (needs to be cleaned - has $ symbol like $250.00) 11 guests_included num 12 host_acceptance_rate Factor - (needs to be cleaned - has format like so - 100%) 13 host_listings_count Factor 14 host_response_rate Factor - (needs to be cleaned - has format like so - 100%) 15 host_total_listings_count Factor 16 latitude Factor 17 longitude Factor 18 maximum_nights num 19 minimum_nights int 20 monthly_price Factor - (needs to be cleaned - has $ symbol like $250.00) 21 price Factor - (needs to be cleaned - has $ symbol like $250.00) 22 security_deposit Factor - (needs to be cleaned - has $ symbol like $250.00) 23 square_feet int 24 weekly_price Factor - (needs to be cleaned - has $ symbol like $250.00)

First let’s explore the columns with the $ symbols and commas and percentages -

dollar_num_cols = c("cleaning_fee", "extra_people", "monthly_price", "price", "security_deposit", "weekly_price")
perc_num_cols = c("host_acceptance_rate", "host_response_rate")

# The columns which have prices
qplot(cleaning_fee, data = df_x) + coord_flip()

qplot(extra_people, data = df_x) + coord_flip()

qplot(monthly_price, data = df_x) + coord_flip()

qplot(price, data = df_x) + coord_flip()

qplot(security_deposit, data = df_x) + coord_flip()

qplot(weekly_price, data = df_x) + coord_flip()

# Observations

# cleaning_fee
# - Majority of NAs
# - Has a few f and t

# extra_people
# - has some longitude/latitude information -
# - 40.7115296729, 37.7172817854, 34.0550373047, 34.0544381605, 34.04703657, 34.0305066944, 33.9847962097, 33.7751358179, 30.3646152573

# price
# -looks fine except for a very small number of NAs

# security_deposit
# mostly NAs
# second most popular category is $0.00

# weekly_price
# - looks fine
# - mostly NAs

# monthly_price
# - NA is the most popular - there are lots
# - other values are mostly 4 digits or more with very few 3 digits

# The columns which have %
qplot(host_acceptance_rate, data = df_x) + coord_flip()

qplot(host_response_rate, data = df_x) + coord_flip()

# host_acceptance_rate
# - most popular category - NA
# - all others are within format

# host_response_rate
# - 100% is most popular
# - Next is NA
# - There are 4 out of place categories - Apartment, Loft, House, Condominium

Storing the discrepancies in a variable for later use -

extra_people_discrepancies = c(40.7115296729, 37.7172817854, 34.0550373047, 34.0544381605, 34.04703657, 34.0305066944, 33.9847962097, 33.7751358179, 30.3646152573)
hostresponserate_discrepancies = c("Apartment", "Loft", "House", "Condominium")
df_x[df_x$extra_people %in% extra_people_discrepancies,]
##           X access accommodates amenities availability_30 availability_365
## 16246 16246   <NA>            t         t               f                1
## 30584 30584   <NA>            t         t               t                2
## 47615 47615   <NA>            t         t               f                1
## 56281 56281   <NA>            t         t               t                2
## 65792 65792   <NA>            t         f               f                1
## 72540 72540   <NA>            t         t               f                5
## 75208 75208   <NA>            t         t               t                2
## 92585 92585   <NA>            t         t               t                1
## 96068 96068   <NA>            t         t               f                1
##                                availability_60 availability_90
## 16246    Long Beach, California, United States          Manuel
## 30584  Santa Monica, California, United States            Medi
## 47615        New York, New York, United States           David
## 56281   Los Angeles, California, United States       Sharlette
## 65792 San Francisco, California, United States           Winni
## 72540   Los Angeles, California, United States            Ryan
## 75208   Los Angeles, California, United States       Sharlette
## 92585             Austin, Texas, United States          Amanda
## 96068   Los Angeles, California, United States             Moe
##               bathrooms bed_type       bedrooms       beds cancellation_policy
## 16246        Long Beach     100% within an hour 2016-07-18                 1.0
## 30584           Del Rey     100% within an hour 2011-07-18                 2.0
## 47615      Williamsburg     100%   within a day 2011-09-21                 1.0
## 56281      Mid-Wilshire     100% within an hour 2014-12-26                 2.0
## 65792         Oceanview     <NA>           <NA> 2015-06-25                 1.0
## 72540          Downtown      81% within an hour 2013-01-20                 5.0
## 75208      Mid-Wilshire     100% within an hour 2014-12-26                 2.0
## 92585 North Shoal Creek     100% within an hour 2014-09-12                 1.0
## 96068              <NA>     100% within an hour 2015-06-30                 1.0
##                                                                      city
## 16246 ['email', 'phone', 'facebook', 'reviews', 'jumio', 'government_id']
## 30584                    ['email', 'phone', 'facebook', 'reviews', 'kba']
## 47615                  ['email', 'phone', 'facebook', 'reviews', 'jumio']
## 56281                      ['email', 'phone', 'google', 'reviews', 'kba']
## 65792                                       ['email', 'phone', 'reviews']
## 72540                ['email', 'phone', 'reviews', 'jumio', 'work_email']
## 75208                      ['email', 'phone', 'google', 'reviews', 'kba']
## 92585                                ['email', 'phone', 'reviews', 'kba']
## 96068                              ['email', 'phone', 'reviews', 'jumio']
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      city_name
## 16246                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  - Keep kitchen, kitchenware and bathroom clean at all times with no exceptions. - Kitchen is strictly close from 10:00 pm to 6:00 am. - No smoking in the building. - No unregistered overnight guest. - There is a friendly dog in my home; so please be pet friendly.
## 30584                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           No smoking and no pets please.
## 47615                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     <NA>
## 56281                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             We hope you are a dog lover. We have two Goldendoodle puppies who love humans. Quiet hours are from 10pm-7am
## 65792                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                No loud noise after 9pm. Guests are responsible for emptying the garbage during the stay.
## 72540 NOTE: we've been having a lot of people book with the intentions of using our home for photoshoots, video, parties corporate events without telling us. Please do not use instant book for a shoot or party without email first, doing so will result in your forfeiting the rental fee or deposit. Our rates are completely different for this.  Please be upfront with what you're looking to do and we will be very cool! If we do find that you have broken these rules, and we absolutely will,  you will be asked to leave and we will not refund your stay Thanks so much :)  -no additional people in the unit than the number you booked for any reason.  -new policy for the building, please no roof usage.  -Please no parties, as this is a residential building. There is a $5000 fee for unauthorized parties in our building.  -The rooftop is accessible via the bedroom, be aware that the trap door takes a bit of muscle to open. Walking on the roof above the neighbors' lofts is strictly prohibited. If the fore
## 75208                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     We hope you are a dog lover. We have two large Goldendoodles who love humans. Quiet hours are 10 pm- 7am. Thank you.
## 92585                                                                                                                             â\200¢ Check-out time is 12 pm and check-in time is 2PM or later. I'm very sensitive to cigarette smoke, so please no smokers. If you must smoke, please do so in the front yard and change your shirt when you come inside. Thank you for your understanding. Please don't feed my animals during your stay unless I'm with you. Also, please don't teach my parrot any naughty words, and don't touch or handle him unless I am present. This is for your safety.  While I love animals, I can't accept any animals in my home. This is to both keep my animals safe, and to keep my home hair free (I'm allergic to dogs and many of my guests and students are allergic to cats.) Although if you want to bring your beta fish, that's fine (yes, I did get that request).  Please clean up after yourself when you use the common areas during your stay (kitchen, living room, bathrooms, porch, studio).
## 96068                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          This listing can accommodate 2 adults, extra fee after 2 people. No smoking, No party after 10 pm. I love my neighbors. I would like my guests to respect them.
##       cleaning_fee
## 16246            f
## 30584            f
## 47615            f
## 56281            f
## 65792            f
## 72540            f
## 75208            f
## 92585            f
## 96068            t
##                                                                                                                                                                                                                                                                                                                                                                        country
## 16246                                                                                                                                                                                                                                                                                           Basically, I am out most of the day because I am working and attending school.
## 30584                                                                                                                                   Free free to contact us anytime! We'd love to show you around if you're not familiar with the area. We'll be glad to give recommendations and we'll most likely be in town (just a few minutes away in Santa Monica) during your stay.
## 47615                                                                                                                                                                                                                                                                                  We will not be on premises however in the city. We will be on call if any issues arise.
## 56281                                                                                                                                                                                                                                                                                                                                                                     <NA>
## 65792                                                                                                                                                                                                                                                                                                                        å\217¯ä»£å®šç§Ÿè½¦,购买门票æ\210–旅行其他咨询
## 72540 We will be here to show you around when you arrive but the place is yours unless we get noise complaints. There is very little noise insulation in the building, so on the main floor of the loft you must keep noise to a minimum. The upstairs floor is more insulated so you can hang out, watch movies or play video games in the theater if you plan to be up late.
## 75208                                                                                                                                                                         There are a lot of good restaurants around the area, things to explore... I can also give recommendations if you're new to the area! Ask away and I'll help you maximize your time and have fun!
## 92585                                                                                                                                                                                                 I'm an oil painter who works from home so I will usually be available for my guests when needed, but I also respect your privacy and try to stay quietly out of the way.
## 96068                                                                                                                                                                                                                          I will meet you upon arrival, and I am available to chat whenever you need me. On your departure you will meet me or a trusted friend/neighbor.
##       country_code description     experiences_offered  extra_people
## 16246         <NA>           t                    <NA> 33.7751358179
## 30584         <NA>           t City of Los Angeles, CA 33.9847962097
## 47615            f           t                    <NA> 40.7115296729
## 56281         <NA>           t City of Los Angeles, CA 34.0550373047
## 65792            f           t       {"SAN FRANCISCO"} 37.7172817854
## 72540         <NA>           t City of Los Angeles, CA 34.0305066944
## 75208         <NA>           t City of Los Angeles, CA 34.0544381605
## 92585         <NA>           t                    <NA> 30.3646152573
## 96068         <NA>           f City of Los Angeles, CA   34.04703657
##       first_review guests_included    host_about host_acceptance_rate
## 16246         <NA>      -118.18485   Los Angeles                 1125
## 30584         <NA>      -118.41256   Los Angeles                   99
## 47615         <NA>       -73.96088      New York                 1125
## 56281         <NA>      -118.29951   Los Angeles                   60
## 65792  STR-0000408      -122.45269 San Francisco                   28
## 72540         <NA>      -118.24847   Los Angeles                 1125
## 75208         <NA>      -118.29775   Los Angeles                 1125
## 92585         <NA>       -97.73292        Austin                 1125
## 96068         <NA>      -118.46222   Los Angeles                  120
##       host_has_profile_pic host_identity_verified
## 16246                    2                   <NA>
## 30584                   22              $2,960.00
## 47615                    2                   <NA>
## 56281                    2                   <NA>
## 65792                    3                   <NA>
## 72540                    1              $8,500.00
## 75208                    2                   <NA>
## 92585                    2                   <NA>
## 96068                    3              $3,200.00
##                                    host_is_superhost
## 16246  Modern, intimate condo in Downtown Long Beach
## 30584            Amazing Retreat w/ Jacuzzi and Pool
## 47615     Brooklyn Style Williamsburg 2 BR Apartment
## 56281                          Koreatown sunlit room
## 65792 Rooms with private entrance 独立ä¸\200æ\210¿ä¸\200厅
## 72540     HUGE 3,500 sq/ft Downtown Los Angeles Loft
## 75208               Private Koreatown Master Bedroom
## 92585            ART, ANIMALS & NATURE! Private room
## 96068               Penthouse Oceanview/UCLA/Parking
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                host_listings_count
## 16246                   Long Beach Downtown (LBDT) is an amazing place to stay, a few blocks away from the beach, Shoreline Village, Pike, The Queen Mary and connected with all the main attractions of Los Angeles such as Hollywood, Universal Studios, Disneyland, Knott's Berry Farm, LA Live, Staple Center, Dodgers Stadium, Angels Stadium through shuttles, train, and buses if you are coming for entertainment. On the other hand if you are going to stay for business or professional studies, LBDT is close to CSULB as well to both campuses of Long Beach Community College, the port of Long Beach, Long Beach Convention Center.
## 30584 A few minutes away are Marina Del Rey (where you can go yachting, kayaking, fine dining, etc), Playa Del Rey beach, and the only beach you can enjoy bonfires in LA- Dockweiler. The Runway shopping mall just opened.. Restaurants, Whole foods market, shopping, parks, and entertainment are only a mile away! The place is conveniently located with easy commutes to the South Bay citites and the rest of LA!   Bike paths are steps away where you could easily bike through the wetlands and to the beach! Plus, there are two reserved parking spaces in addition to many more parking always available for guests.  Truly a gem :)
## 47615                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       We are surrounded by many independent coffee shops, fun bars and a range of different lively restaurants with many different cuisines.
## 56281                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         <NA>
## 65792                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         <NA>
## 72540                                                                                                                                                                                                                                                                                                                                                                                       Located on the edge of the Fashion District, we are on quite possibly the quietest block in Downtown Los Angeles. The loft is about a $4 Uber ride from the city center where you will find thousands of restaurants and bars within walking distance.
## 75208                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         <NA>
## 92585                                                                                                                                                                                                                                                                                                                                                        My neighborhood is closely located to the Domain and the Arboretum area with loads of shopping and dining options. I'm also closely located to two Whole Foods Markets. I'm within walking distance of many dining and entertainment options, including The Alamo Drafthouse Village.
## 96068                                                                                                                                                                                                                                                                                                                                                                                                                                     Some of the best restaurants of Los Angeles is in walking distance. Chinese, Persian, Thai, Vietnamese, Berger, Organic, Vegan, Vegeterian, Mexican and more. There are few bars walking distance to me.
##           host_location
## 16246        Long Beach
## 30584           Del Rey
## 47615      Williamsburg
## 56281      Mid-Wilshire
## 65792         Oceanview
## 72540          Downtown
## 75208      Mid-Wilshire
## 92585 North Shoal Creek
## 96068              <NA>
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                             host_name
## 16246                                                                                                                                                                                                                                                                                                                                                                                                                                                                            <NA>
## 30584                                                          Speedy internet connection, with cable and Netflix on HD tv!  Appliances and cookware include: coffee maker, microwave, stainless steel cookware, coffee maker, toaster, pots, water boiler and knives set. For your convenience: iron, iron board, hair dryer, extra beach/pool towels, crib, pillows, sheets, plush queen air mattress, and spacious walk-in closet with more than enough room to put your things :)
## 47615                                                                                                                                                                                                                                                                                                                                                                                                                                                                            <NA>
## 56281                                                                                                                                                                                                                                                                                There are a lot of good restaurants around the area, things to explore... I can also give recommendations if you're new to the area! Ask away and I'll help you maximize your time and have fun!
## 65792                                                                                                                                                                                                                                                                                                                                                                                                                                                                            <NA>
## 72540 Take a one mile car ride to some of the finest dining and cocktails on the west coast, or keep it über local. Daniel Benhaim, internationally renowned chef, who personally sources his meat and produce from local organic farms, throws private dinner parties next door 3 or 4 times a month. Look up his menu and calendar by googling his name and "because the wind." It is a magical evening where the beautiful people show their faces and eat Daniel's decadent art.
## 75208                                                                                                                                                                                                                                                                                                                                                                                                                                                                            <NA>
## 92585      I teach from my home studio on Tuesday, Wednesday and Friday evenings and Saturdays during the day, but this will not interfere with your living space. On most Friday evenings from 8-10 pm I host an art club meeting which will sometimes involve using the living room to watch a movie. I have a boyfriend who travels a lot so he usually isn't here, but sometimes he is. He is very friendly and respectful.  Check out time is 12:00 pm, and check in is 2:00 pm.
## 96068                                                                                                                                                                                                                                                                                                                                                                                                                                                                            <NA>
##       host_neighbourhood host_response_rate host_response_time host_since
## 16246             $63.00        Condominium                  f          f
## 30584            $153.00          Apartment                  f          f
## 47615            $195.00          Apartment                  f          f
## 56281             $65.00          Apartment                  f          f
## 65792            $195.00              House                  f          f
## 72540            $445.00               Loft                  f          f
## 75208             $75.00          Apartment                  f          f
## 92585             $40.00              House                  f          f
## 96068            $113.00          Apartment                  f          f
##       host_total_listings_count host_verifications     house_rules
## 16246                         f              100.0    Private room
## 30584                         f               90.0 Entire home/apt
## 47615                         f               80.0 Entire home/apt
## 56281                         f               90.0    Private room
## 65792                         t               98.0    Private room
## 72540                         f               90.0 Entire home/apt
## 75208                         f              100.0    Private room
## 92585                         f               97.0    Private room
## 96068                         f               88.0 Entire home/apt
##       instant_bookable          interaction
## 16246             <NA>       Long Beach, CA
## 30584          $250.00      Los Angeles, CA
## 47615          $250.00         Brooklyn, NY
## 56281          $100.00      Los Angeles, CA
## 65792          $500.00    San Francisco, CA
## 72540             <NA>      Los Angeles, CA
## 75208          $100.00      Los Angeles, CA
## 92585          $100.00           Austin, TX
## 96068          $200.00 West Los Angeles, CA
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       is_business_travel_ready
## 16246                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     Roomie cozy bedroom (195 square ft) with a queen size bed, tv, a chair, AC, heater and coin laundry in the basement of the building.
## 30584                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       Completely renovated!  800 Sq feet -Luxurious jacuzzi and marble stand-alone shower -Hardwood flooring -Marble flooring in kitchen and bathroom -2 parking spaces (not tandem) -Stainless steel fridge -Dishwasher -All ceiling lights have dimmer switches -Gorgeous hardwood ceilings -Triple paned windows and balcony doors  -Balcony looks out to the pool -Double lounger for relaxing on the balcony -Internet and Wifi -Cable and Netflix on HD tv -Comfy bed and mattress  -We provide bedding and towels
## 47615                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                We are happy to share our 2 bedroom apartment that we call home. There are two good sized bedrooms both with Queen beds and sunlight throughout the day. There is a modern kitchen, large flat screen television with Verizon Cable/Apple TV, wifi internet, modern bathroom and a  large communal rooftop with breathtaking Manhattan Skyline views. Also decorated by original artwork from our friend @lucasgeorpaints
## 56281                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     <NA>
## 65792                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   Close to Bart, easy to get to downtown
## 72540                                                                                                                                                                                                                                                                                                                                                                                                           It's a breathtaking space. Original hardwood floors and exposed brick, soaring 20' ceilings, a floating staircase, mid-century art, a beautiful reclaimed wood dining table, geometric metal sculptures, sequoia stump side tables, a CA King size Tempurpedic bed, and a trap door from the bedroom leading to a rooftop view worthy of Apple's recent iPhone 6 commercial, which was filmed here.  The industrial neighborhood is quiet and empty, though a bit dirty, but everything cute in DTLA is only about 1 mile away, which makes ours the best view of the skyline. We also have a 300 megabit internet connection.
## 75208                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    The furniture is brand new and thoughtfully planned for this master bedroom.  I've purchased a new queen sized memory foam mattress for your comfort.
## 92585 This charming home is located in the friendly neighborhood of North Shoal Creek is perfect for anyone who loves art, animals and nature. I have two happy, free-ranging chickens (Heraldine and Ruby) who lay fresh organic and soy-free eggs daily for your consumption. There are two outdoor snugly rabbits (Darwin and Pearl) who will gladly accept your ear rubs. And the king of the house is my adorable African Grey parrot, Jack. He may or may not let you scratch his head, but he is certain to entertain with his large vocabulary and sense of humor. My backyard is lush and green most of the year, and the lily pond is gorgeous when in bloom, which attracts a host of native birds and wildlife. Don't worry about those Texas mosquitoes, because I have a lovely screened in porch complete with comfy furniture and a hammock for you to nap or lounge in while safe from pesky insects.  My kitchen is fully stocked with everything you need to make a meal, including a bread machine, Super Angel Juicer, cr
## 96068                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             The space is a large studio apartment with hip to ceiling windows. Tow sides are windows and two walls are mirrored. Sun enters the house as she rises and leaves the house as it sets. The views are spectacular! Views of the ocean and pool from one window and the Hill's and downtown Beverly Hills from the other.  Ocean breeze comes in the house if you open one window on the ocean side and the other one in the kitchen. That feels you are in Heaven. Welcome to the Penthouse!
##       is_location_exact jurisdiction_names
## 16246              <NA>                 CA
## 30584              <NA>                 CA
## 47615              <NA>                 NY
## 56281              <NA>                 CA
## 65792              <NA>                 CA
## 72540              <NA>                 CA
## 75208              <NA>                 CA
## 92585              <NA>                 TX
## 96068              <NA>                 CA
##                                                 latitude
## 16246    Long Beach, Long Beach, CA 90813, United States
## 30584      Del Rey, Los Angeles, CA 90066, United States
## 47615                        Brooklyn, NY, United States
## 56281 Mid-Wilshire, Los Angeles, CA 90006, United States
## 65792                   San Francisco, CA, United States
## 72540     Downtown, Los Angeles, CA 90021, United States
## 75208 Mid-Wilshire, Los Angeles, CA 90006, United States
## 92585    Briarwood Lane, Austin, TX 78757, United States
## 96068          West Los Angeles, CA 90025, United States
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                      license
## 16246                                                                                                                                                  My place is in Long Beach downtown. Close to Pike, Shoreline Village, Queen Mary, Belmont Shore, and just around the corner from the night life scene.. Youâ\200\231ll love my place because of the comfy bed, the coziness, and the high ceilings. My place is good for couples, solo adventurers, and business travelers.
## 30584 Completely, beautifully renovated from floor to ceiling! Located in the heart of Westside- mins to Marina Del Rey, LMU, Playa Vista, Playa Del Rey, Mar Vista, Culver City, Venice Beach, Santa Monica.  With 2 parking spaces. Very quiet complex with pool, hot tub,  laundry onsite, bike access to the beach. Fully stocked kitchen with cookware. Family friendly for toddlers with new crib, high chair, and bath. HD television and high speed internet access.
## 47615                                                                                                                                                                                                                                                                                                    Two bedroom apartment in the middle of all Williamsburg's amazing restaurant, bar and cafe scene. 5 mins into Manhattan - between Marcy Ave JMZ line and Bedford L.
## 56281                                                                                                                                                                                                                                                     WELCOME HOME! Our spacious room is well lit with a city feel.  Close to The Grove, Downtown LA, Santa Monica, Beverly Hills, Silverlake.  As this is your home away from home, I will ensure you feel comfortable.
## 65792                                                                                                                                                                                                                                                                           Very nice rooms in a convenient location, close to Bart, Muni and highways. 15 minutes drive from the airport or to downtown SF. One queen bed and one futon that can occupy up to 4 people.
## 72540                                                                                                                                                                                                                              Spacious and beautiful, You will love this 3,500sqft loft. Located in an early 1900s industrial building, it features exposed brick, 25' high ceilings, Hue lights, a theatre with a 100"screen, Apple TV, steinway grand piano and more!
## 75208                                                                                                                                               Welcome Home! Our spacious room is well lit with a city feel! - Close to The Grove, Koreatown Plaza, Downtown LA, Santa Monica, Beverly Hills, Westwood - Convenient public transportation & most places are a cheap UBER ride away.  I will provide FREE COFFEE/TEA/CHOCOLATE and clean sheets + towels + robe for you.
## 92585                                                                                                                                                                                                                                                                                                                      This is the perfect place for art, animal and nature lovers in a charming and convenient neighborhood, with a well-traveled and friendly host. :)
## 96068                                                                                                                                                                                                             Gorgeous Ocean/Sunset view, Gated,24 hour security, parking,Beautiful Olympic pool,Gym,Walk to Brentwood restaurants, bars.Great Public Transport The studio has sofa-bed and private bathroom w/shower. 10 minutes to Santa Monica pier, 3rd street,UCLA.
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              longitude
## 16246                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             The place is a couple of blocks away from the blue line, Long Beach Transit Mall, and close to 710 and 405 freeways.
## 30584                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             <NA>
## 47615                                                                                                                                                                                                                                                                                                                                                                                                                  There are various transportation options. We are at the footstep of the Williamsburg bridge so an Uber/Taxi into Manhattan is only 5 mins.  We are between the Marcy Ave (JMZ) and Bedford Ave (L) MTA Stops which are both one stop into Manhattan. Both stops are around 5mins away. There is also a citi bike rack at the end of our street.
## 56281                                                                                                                         Public transit is 3 blocks away if you want to get around LA via metro! Very fast and convenient way of traveling. 10 min walk to the Wiltern  10 min walk to the Purple Line (Wilshire/Normandie station) 5 min drive to Downtown LA/Staples Center  10 min drive to Hollywood Blvd. 20 min drive to Santa Monica and Venice beach and UCLA 25 min drive to LAX. There is street parking in my neighborhood, and no dedicated off-street parking space. We hope you understand that parking may be hard to find right away especially late at night, as it is a densely populated street. Lastly, our apartment is on the second floor!
## 65792                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           10 minutes walk to Bart station that can take you to downtown or airport. Muni M line two blocks away.
## 72540                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         Uber, Lyft and Sidecar are readily available to take you the 1 mile to the center of Downtown Los Angeles, where your entertainment possibilities are literally endless.
## 75208                                                                                                                                                                                   Public transit is 3 blocks away if you want to get around LA via metro! Very fast and convenient way of traveling. 10 min walk to the Wiltern  10 min walk to the Purple Line (Wilshire/Normandie station) 5 min drive to Downtown LA/Staples Center  10 min drive to Hollywood Blvd. 20 min drive to Santa Monica and Venice beach and UCLA 25 minutes driving to LAX. There is street parking if you need to park your car. As it is a densely populated street, parking may be hard to find. We are accepting guests who are using primarily Uber or public transportation.
## 92585 ****UPDATE: Uber and Lyft are temporarily suspended in Austin. Other options are GetMe, Wingz, Fare and RideAustin. I can't attest to their reliability or price as I have not yet used them. However, if prior arrangements are made and I am available, I will happily take you to and from the airport and drop you off around Austin for "donations" that you deem fair (please take into account my time and distance driving to and from your drop off place).  My house is about a 10-15 minute ride from almost anywhere in Austin, and about a 30 minute drive from the airport unless it's rush hour traffic. There is a bus stop a block away that will take you downtown in about 40 minutes. I also have a bicycle available to rent for $10 a day.
## 96068                                                                                                                                                                                                                                                                                                                                                                                                                                               If you are not driving...Public transportation is right at the corner, the bus schedules are easy to access online. These buses will get you to the beach in Venice and Santa Monica or even as far as Downtown Los Angeles or Malibu. You can take the bus to Santa Monica Beach in 15-20 minutes (3 miles away).
##          market maximum_nights minimum_nights monthly_price name
## 16246      <NA>          90813              0          <NA> <NA>
## 30584      <NA>          90066              0          <NA> <NA>
## 47615      <NA>          11211              0          <NA> <NA>
## 56281      <NA>          90006              0          <NA> <NA>
## 65792 $1,100.00          94112              0          <NA> <NA>
## 72540 $2,999.00          90021              0          <NA> <NA>
## 75208      <NA>          90006              0          <NA> <NA>
## 92585      <NA>          78757              1          <NA> <NA>
## 96068   $750.00          90025              0          <NA> <NA>
##       neighborhood_overview neighbourhood notes price property_type
## 16246                  <NA>          <NA>  <NA>  <NA>          <NA>
## 30584                  <NA>          <NA>  <NA>  <NA>          <NA>
## 47615                  <NA>          <NA>  <NA>  <NA>          <NA>
## 56281                  <NA>          <NA>  <NA>  <NA>          <NA>
## 65792                  <NA>          <NA>  <NA>  <NA>          <NA>
## 72540                  <NA>          <NA>  <NA>  <NA>          <NA>
## 75208                  <NA>          <NA>  <NA>  <NA>          <NA>
## 92585                  <NA>          <NA>  <NA>  <NA>          <NA>
## 96068                  <NA>          <NA>  <NA>  <NA>          <NA>
##       require_guest_phone_verification require_guest_profile_picture
## 16246                             <NA>                          <NA>
## 30584                             <NA>                          <NA>
## 47615                             <NA>                          <NA>
## 56281                             <NA>                          <NA>
## 65792                             <NA>                          <NA>
## 72540                             <NA>                          <NA>
## 75208                             <NA>                          <NA>
## 92585                             <NA>                          <NA>
## 96068                             <NA>                          <NA>
##       requires_license room_type security_deposit smart_location space
## 16246             <NA>      <NA>             <NA>           <NA>  <NA>
## 30584             <NA>      <NA>             <NA>           <NA>  <NA>
## 47615             <NA>      <NA>             <NA>           <NA>  <NA>
## 56281             <NA>      <NA>             <NA>           <NA>  <NA>
## 65792             <NA>      <NA>             <NA>           <NA>  <NA>
## 72540             <NA>      <NA>             <NA>           <NA>  <NA>
## 75208             <NA>      <NA>             <NA>           <NA>  <NA>
## 92585             <NA>      <NA>             <NA>           <NA>  <NA>
## 96068             <NA>      <NA>             <NA>           <NA>  <NA>
##       square_feet state street summary transit weekly_price zipcode
## 16246          NA  <NA>   <NA>    <NA>    <NA>         <NA>    <NA>
## 30584          NA  <NA>   <NA>    <NA>    <NA>         <NA>    <NA>
## 47615          NA  <NA>   <NA>    <NA>    <NA>         <NA>    <NA>
## 56281          NA  <NA>   <NA>    <NA>    <NA>         <NA>    <NA>
## 65792          NA  <NA>   <NA>    <NA>    <NA>         <NA>    <NA>
## 72540          NA  <NA>   <NA>    <NA>    <NA>         <NA>    <NA>
## 75208          NA  <NA>   <NA>    <NA>    <NA>         <NA>    <NA>
## 92585          NA  <NA>   <NA>    <NA>    <NA>         <NA>    <NA>
## 96068          NA  <NA>   <NA>    <NA>    <NA>         <NA>    <NA>

All the columns seem to be misplaced. We can store this separately and rearrange them -

misplacedX <- c(16246, 30584, 47615, 56281, 65792, 72540, 75208, 92585, 96068)

misplaced_df_x <- df_x[df_x$extra_people %in% extra_people_discrepancies,]
df_x[df_x$extra_people %in% extra_people_discrepancies, 3:70] <- NA
df_x[df_x$X %in% misplacedX, ]
##           X access accommodates amenities availability_30 availability_365
## 16246 16246   <NA>         <NA>      <NA>            <NA>               NA
## 30584 30584   <NA>         <NA>      <NA>            <NA>               NA
## 47615 47615   <NA>         <NA>      <NA>            <NA>               NA
## 56281 56281   <NA>         <NA>      <NA>            <NA>               NA
## 65792 65792   <NA>         <NA>      <NA>            <NA>               NA
## 72540 72540   <NA>         <NA>      <NA>            <NA>               NA
## 75208 75208   <NA>         <NA>      <NA>            <NA>               NA
## 92585 92585   <NA>         <NA>      <NA>            <NA>               NA
## 96068 96068   <NA>         <NA>      <NA>            <NA>               NA
##       availability_60 availability_90 bathrooms bed_type bedrooms beds
## 16246            <NA>            <NA>      <NA>     <NA>     <NA> <NA>
## 30584            <NA>            <NA>      <NA>     <NA>     <NA> <NA>
## 47615            <NA>            <NA>      <NA>     <NA>     <NA> <NA>
## 56281            <NA>            <NA>      <NA>     <NA>     <NA> <NA>
## 65792            <NA>            <NA>      <NA>     <NA>     <NA> <NA>
## 72540            <NA>            <NA>      <NA>     <NA>     <NA> <NA>
## 75208            <NA>            <NA>      <NA>     <NA>     <NA> <NA>
## 92585            <NA>            <NA>      <NA>     <NA>     <NA> <NA>
## 96068            <NA>            <NA>      <NA>     <NA>     <NA> <NA>
##       cancellation_policy city city_name cleaning_fee country country_code
## 16246                <NA> <NA>      <NA>         <NA>    <NA>         <NA>
## 30584                <NA> <NA>      <NA>         <NA>    <NA>         <NA>
## 47615                <NA> <NA>      <NA>         <NA>    <NA>         <NA>
## 56281                <NA> <NA>      <NA>         <NA>    <NA>         <NA>
## 65792                <NA> <NA>      <NA>         <NA>    <NA>         <NA>
## 72540                <NA> <NA>      <NA>         <NA>    <NA>         <NA>
## 75208                <NA> <NA>      <NA>         <NA>    <NA>         <NA>
## 92585                <NA> <NA>      <NA>         <NA>    <NA>         <NA>
## 96068                <NA> <NA>      <NA>         <NA>    <NA>         <NA>
##       description experiences_offered extra_people first_review guests_included
## 16246        <NA>                <NA>         <NA>         <NA>              NA
## 30584        <NA>                <NA>         <NA>         <NA>              NA
## 47615        <NA>                <NA>         <NA>         <NA>              NA
## 56281        <NA>                <NA>         <NA>         <NA>              NA
## 65792        <NA>                <NA>         <NA>         <NA>              NA
## 72540        <NA>                <NA>         <NA>         <NA>              NA
## 75208        <NA>                <NA>         <NA>         <NA>              NA
## 92585        <NA>                <NA>         <NA>         <NA>              NA
## 96068        <NA>                <NA>         <NA>         <NA>              NA
##       host_about host_acceptance_rate host_has_profile_pic
## 16246       <NA>                 <NA>                 <NA>
## 30584       <NA>                 <NA>                 <NA>
## 47615       <NA>                 <NA>                 <NA>
## 56281       <NA>                 <NA>                 <NA>
## 65792       <NA>                 <NA>                 <NA>
## 72540       <NA>                 <NA>                 <NA>
## 75208       <NA>                 <NA>                 <NA>
## 92585       <NA>                 <NA>                 <NA>
## 96068       <NA>                 <NA>                 <NA>
##       host_identity_verified host_is_superhost host_listings_count
## 16246                   <NA>              <NA>                <NA>
## 30584                   <NA>              <NA>                <NA>
## 47615                   <NA>              <NA>                <NA>
## 56281                   <NA>              <NA>                <NA>
## 65792                   <NA>              <NA>                <NA>
## 72540                   <NA>              <NA>                <NA>
## 75208                   <NA>              <NA>                <NA>
## 92585                   <NA>              <NA>                <NA>
## 96068                   <NA>              <NA>                <NA>
##       host_location host_name host_neighbourhood host_response_rate
## 16246          <NA>      <NA>               <NA>               <NA>
## 30584          <NA>      <NA>               <NA>               <NA>
## 47615          <NA>      <NA>               <NA>               <NA>
## 56281          <NA>      <NA>               <NA>               <NA>
## 65792          <NA>      <NA>               <NA>               <NA>
## 72540          <NA>      <NA>               <NA>               <NA>
## 75208          <NA>      <NA>               <NA>               <NA>
## 92585          <NA>      <NA>               <NA>               <NA>
## 96068          <NA>      <NA>               <NA>               <NA>
##       host_response_time host_since host_total_listings_count
## 16246               <NA>       <NA>                      <NA>
## 30584               <NA>       <NA>                      <NA>
## 47615               <NA>       <NA>                      <NA>
## 56281               <NA>       <NA>                      <NA>
## 65792               <NA>       <NA>                      <NA>
## 72540               <NA>       <NA>                      <NA>
## 75208               <NA>       <NA>                      <NA>
## 92585               <NA>       <NA>                      <NA>
## 96068               <NA>       <NA>                      <NA>
##       host_verifications house_rules instant_bookable interaction
## 16246               <NA>        <NA>             <NA>        <NA>
## 30584               <NA>        <NA>             <NA>        <NA>
## 47615               <NA>        <NA>             <NA>        <NA>
## 56281               <NA>        <NA>             <NA>        <NA>
## 65792               <NA>        <NA>             <NA>        <NA>
## 72540               <NA>        <NA>             <NA>        <NA>
## 75208               <NA>        <NA>             <NA>        <NA>
## 92585               <NA>        <NA>             <NA>        <NA>
## 96068               <NA>        <NA>             <NA>        <NA>
##       is_business_travel_ready is_location_exact jurisdiction_names latitude
## 16246                     <NA>              <NA>               <NA>     <NA>
## 30584                     <NA>              <NA>               <NA>     <NA>
## 47615                     <NA>              <NA>               <NA>     <NA>
## 56281                     <NA>              <NA>               <NA>     <NA>
## 65792                     <NA>              <NA>               <NA>     <NA>
## 72540                     <NA>              <NA>               <NA>     <NA>
## 75208                     <NA>              <NA>               <NA>     <NA>
## 92585                     <NA>              <NA>               <NA>     <NA>
## 96068                     <NA>              <NA>               <NA>     <NA>
##       license longitude market maximum_nights minimum_nights monthly_price name
## 16246    <NA>      <NA>   <NA>             NA             NA          <NA> <NA>
## 30584    <NA>      <NA>   <NA>             NA             NA          <NA> <NA>
## 47615    <NA>      <NA>   <NA>             NA             NA          <NA> <NA>
## 56281    <NA>      <NA>   <NA>             NA             NA          <NA> <NA>
## 65792    <NA>      <NA>   <NA>             NA             NA          <NA> <NA>
## 72540    <NA>      <NA>   <NA>             NA             NA          <NA> <NA>
## 75208    <NA>      <NA>   <NA>             NA             NA          <NA> <NA>
## 92585    <NA>      <NA>   <NA>             NA             NA          <NA> <NA>
## 96068    <NA>      <NA>   <NA>             NA             NA          <NA> <NA>
##       neighborhood_overview neighbourhood notes price property_type
## 16246                  <NA>          <NA>  <NA>  <NA>          <NA>
## 30584                  <NA>          <NA>  <NA>  <NA>          <NA>
## 47615                  <NA>          <NA>  <NA>  <NA>          <NA>
## 56281                  <NA>          <NA>  <NA>  <NA>          <NA>
## 65792                  <NA>          <NA>  <NA>  <NA>          <NA>
## 72540                  <NA>          <NA>  <NA>  <NA>          <NA>
## 75208                  <NA>          <NA>  <NA>  <NA>          <NA>
## 92585                  <NA>          <NA>  <NA>  <NA>          <NA>
## 96068                  <NA>          <NA>  <NA>  <NA>          <NA>
##       require_guest_phone_verification require_guest_profile_picture
## 16246                             <NA>                          <NA>
## 30584                             <NA>                          <NA>
## 47615                             <NA>                          <NA>
## 56281                             <NA>                          <NA>
## 65792                             <NA>                          <NA>
## 72540                             <NA>                          <NA>
## 75208                             <NA>                          <NA>
## 92585                             <NA>                          <NA>
## 96068                             <NA>                          <NA>
##       requires_license room_type security_deposit smart_location space
## 16246             <NA>      <NA>             <NA>           <NA>  <NA>
## 30584             <NA>      <NA>             <NA>           <NA>  <NA>
## 47615             <NA>      <NA>             <NA>           <NA>  <NA>
## 56281             <NA>      <NA>             <NA>           <NA>  <NA>
## 65792             <NA>      <NA>             <NA>           <NA>  <NA>
## 72540             <NA>      <NA>             <NA>           <NA>  <NA>
## 75208             <NA>      <NA>             <NA>           <NA>  <NA>
## 92585             <NA>      <NA>             <NA>           <NA>  <NA>
## 96068             <NA>      <NA>             <NA>           <NA>  <NA>
##       square_feet state street summary transit weekly_price zipcode
## 16246          NA  <NA>   <NA>    <NA>    <NA>         <NA>    <NA>
## 30584          NA  <NA>   <NA>    <NA>    <NA>         <NA>    <NA>
## 47615          NA  <NA>   <NA>    <NA>    <NA>         <NA>    <NA>
## 56281          NA  <NA>   <NA>    <NA>    <NA>         <NA>    <NA>
## 65792          NA  <NA>   <NA>    <NA>    <NA>         <NA>    <NA>
## 72540          NA  <NA>   <NA>    <NA>    <NA>         <NA>    <NA>
## 75208          NA  <NA>   <NA>    <NA>    <NA>         <NA>    <NA>
## 92585          NA  <NA>   <NA>    <NA>    <NA>         <NA>    <NA>
## 96068          NA  <NA>   <NA>    <NA>    <NA>         <NA>    <NA>

Now we can work on transforming the data -

# First, the columns which have prices -

# cleaning_fee, extra_people, price, security_deposit, weekly_price, monthly_price

df_x$cleaning_fee <- as.numeric(str_replace_all(df_x$cleaning_fee, "[\\$,]", ""))
  # test data
tdf$cleaning_fee <- as.numeric(str_replace_all(tdf$cleaning_fee, "[\\$,]", ""))
## Warning: NAs introduced by coercion
df_x$extra_people <- as.numeric(str_replace_all(df_x$extra_people, "[\\$,]", ""))
  # test data
tdf$extra_people <- as.numeric(str_replace_all(tdf$extra_people, "[\\$,]", ""))

df_x$price <- as.numeric(str_replace_all(df_x$price, "[\\$,]", ""))
  # test data
tdf$price <- as.numeric(str_replace_all(tdf$price, "[\\$,]", ""))

df_x$security_deposit <- as.numeric(str_replace_all(df_x$security_deposit, "[\\$,]", ""))
  # test data
tdf$security_deposit <- as.numeric(str_replace_all(tdf$security_deposit, "[\\$,]", ""))

df_x$weekly_price <- as.numeric(str_replace_all(df_x$weekly_price, "[\\$,]", ""))
  # test data
tdf$weekly_price <- as.numeric(str_replace_all(tdf$weekly_price, "[\\$,]", ""))

df_x$monthly_price <- as.numeric(str_replace_all(df_x$monthly_price, "[\\$,]", ""))
  # test data
tdf$monthly_price <- as.numeric(str_replace_all(tdf$monthly_price, "[\\$,]", ""))

# Next, the columns which havedf_x$host_acceptance_rate - has %

# host_acceptance_rate, host_response_rate

df_x$host_acceptance_rate <- as.numeric(str_replace_all(df_x$host_acceptance_rate, "%", ""))
  # test data
tdf$host_acceptance_rate <- as.numeric(str_replace_all(tdf$host_acceptance_rate, "%", ""))

df_x$host_response_rate <- as.numeric(str_replace_all(df_x$host_response_rate, "%", ""))
  # test data
tdf$host_response_rate <- as.numeric(str_replace_all(tdf$host_response_rate, "%", ""))
## Warning: NAs introduced by coercion

The other columns now -

1 accommodates Factor 2 availability_30 Factor 3 availability_365 int 4 availability_60 Factor 5 availability_90 Factor 6 bathrooms Factor 7 bedrooms Factor 8 beds Factor # 9 cleaning_fee Factor - (needs to be cleaned - has $ symbol like $250.00) # 10 extra_people Factor - (needs to be cleaned - has $ symbol like $250.00) 11 guests_included num # 12 host_acceptance_rate Factor - (needs to be cleaned - has format like so - 100%) 13 host_listings_count Factor # 14 host_response_rate Factor - (needs to be cleaned - has format like so - 100%) 15 host_total_listings_count Factor 16 latitude Factor 17 longitude Factor 18 maximum_nights num 19 minimum_nights int # 20 monthly_price Factor - (needs to be cleaned - has $ symbol like $250.00) # 21 price Factor - (needs to be cleaned - has $ symbol like $250.00) # 22 security_deposit Factor - (needs to be cleaned - has $ symbol like $250.00) 23 square_feet int # 24 weekly_price Factor - (needs to be cleaned - has $ symbol like $250.00)

qplot(as.numeric(accommodates), data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).

qplot(as.numeric(availability_30), data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).

qplot(as.numeric(availability_365), data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).

qplot(as.numeric(availability_60), data = df_x) + coord_flip()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).

qplot(as.numeric(availability_90), data = df_x) + coord_flip()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).

qplot(as.numeric(bathrooms), data = df_x) + coord_flip()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 254 rows containing non-finite values (stat_bin).

qplot(as.numeric(bedrooms), data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 101 rows containing non-finite values (stat_bin).

qplot(as.numeric(beds), data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 92 rows containing non-finite values (stat_bin).

qplot(as.numeric(guests_included), data = df_x) + coord_flip()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).

qplot(as.numeric(host_listings_count), data = df_x) + coord_flip()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 151 rows containing non-finite values (stat_bin).

qplot(as.numeric(host_total_listings_count), data = df_x) + coord_flip()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 151 rows containing non-finite values (stat_bin).

# qplot(longitude, data = df_x)
# qplot(latitude, data = df_x)
qplot(as.numeric(maximum_nights), data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 19 rows containing non-finite values (stat_bin).

qplot(as.numeric(minimum_nights), data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 19 rows containing non-finite values (stat_bin).

qplot(as.numeric(square_feet), data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 98419 rows containing non-finite values (stat_bin).

Visualize in log scale -

qplot(as.numeric(accommodates), data = df_x) + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).

qplot(as.numeric(availability_30), data = df_x) + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).

qplot(as.numeric(availability_365), data = df_x) + scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 16728 rows containing non-finite values (stat_bin).

qplot(as.numeric(availability_60), data = df_x) + coord_flip() + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).

qplot(as.numeric(availability_90), data = df_x) + coord_flip() + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).

qplot(as.numeric(bathrooms), data = df_x) + coord_flip() + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 254 rows containing non-finite values (stat_bin).

qplot(as.numeric(bedrooms), data = df_x) + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 101 rows containing non-finite values (stat_bin).

qplot(as.numeric(beds), data = df_x) + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 92 rows containing non-finite values (stat_bin).

qplot(as.numeric(guests_included), data = df_x) + coord_flip() + scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 365 rows containing non-finite values (stat_bin).

qplot(as.numeric(host_listings_count), data = df_x) + coord_flip() + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 151 rows containing non-finite values (stat_bin).

qplot(as.numeric(host_total_listings_count), data = df_x) + coord_flip() + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 151 rows containing non-finite values (stat_bin).

# qplot(longitude, data = df_x)
# qplot(latitude, data = df_x)
qplot(as.numeric(maximum_nights), data = df_x) + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 19 rows containing non-finite values (stat_bin).

qplot(as.numeric(minimum_nights), data = df_x) + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 19 rows containing non-finite values (stat_bin).

qplot(as.numeric(square_feet), data = df_x) + scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 98536 rows containing non-finite values (stat_bin).

Convert these columns to numeric

num_cols <- c("accommodates", "availability_30", "availability_365", "availability_60", 
              "availability_90", "bathrooms", "bedrooms", "beds", "guests_included", 
              "host_listings_count", "host_total_listings_count", 
              "maximum_nights", "minimum_nights", "square_feet")

geo_cols <- c("latitude", "longitude")

df_x[,num_cols] <- sapply(df_x[,num_cols],as.numeric)
  # test data
tdf[,num_cols] <- sapply(tdf[,num_cols],as.numeric)

df_x[,geo_cols] <- sapply(df_x[,geo_cols],as.character)
df_x[,geo_cols] <- sapply(df_x[,geo_cols],as.numeric)
  # test data
tdf[,geo_cols] <- sapply(tdf[,geo_cols],as.character)
tdf[,geo_cols] <- sapply(tdf[,geo_cols],as.numeric)
## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion

## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion

Observations for these variables -

  1. availability_30
  1. availability_365
  1. availability_60
  1. availability_90
  1. Bathrooms
  1. Bedrooms
  1. guests_included
  1. accommodates
  1. host_listings_count
  1. maximum_nights
  1. minimum_nights
  1. latitude, longitude
ggplot(gather(df_x[, num_cols]), aes(value)) + 
    geom_histogram(stat = "count") + 
    facet_wrap(~key, scales = 'free_x')
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 99260 rows containing non-finite values (stat_count).

Now a look at categorical variables -

1 bed_type Factor

2 cancellation_policy Factor

3 city Factor

4 city_name Factor

5 country Factor

6 country_code Factor

7 experiences_offered Factor

8 host_has_profile_pic Factor 9 host_identity_verified Factor 10 host_is_superhost Factor # 11 host_location Factor # 12 host_neighbourhood Factor # 13 host_response_time Factor 14 instant_bookable Factor 15 is_business_travel_ready Factor 16 is_location_exact Factor # 17 jurisdiction_names Factor # 18 license Factor # 19 market Factor # 20 neighbourhood Factor # 21 property_type Factor 22 require_guest_phone_verification Factor 23 require_guest_profile_picture Factor 24 requires_license Factor # 25 room_type Factor # 26 smart_location Factor # 27 state Factor

There are a bunch of columns which have ‘t’ and ‘f’

  1. df$host_has_profile_pic - factor
  2. df$host_identity_verified - factor
  3. df$host_is_superhost - factor
  4. df$instant_bookable - factor
  5. df$is_business_travel_ready - factor
  6. df$is_location_exact - factor
  7. df$require_guest_phone_verification - factor
  8. df$require_guest_profile_picture - factor
  9. df$requires_license - factor

First, histograms.

These columns seem to comprise mostly of ’f’s and ’t’s.

We need to figure out how to deal with this specific column since more than 45% of the values are missing. We’ll get to that but first, let’s take care of the other columns.

# View(table(tdf$host_has_profile_pic))
# View(table(tdf$host_identity_verified))
# View(table(tdf$host_is_superhost))
# View(table(tdf$instant_bookable))
# View(table(tdf$is_business_travel_ready))
# View(table(tdf$is_location_exact))
# View(table(tdf$require_guest_phone_verification))
# View(table(tdf$require_guest_profile_picture))
# View(table(tdf$requires_license))

Let’s fix the discrepancies in the test data

superhost_discrepancies <- c("Bed,Bath&Bike in Sunny Santa Monica", "Pristine Mid-Century Modern w 180° Canyon View!")

misplacedy <- c(775, 10274)

misplaced_tdf <- tdf[tdf$host_is_superhost %in% superhost_discrepancies, ]

tdf[tdf$host_is_superhost %in% superhost_discrepancies, 3:70] <- NA

tdf[tdf$X %in% misplacedy, ]
##           X access accommodates amenities availability_30 availability_365
## 775     775   <NA>           NA      <NA>              NA               NA
## 10274 10274   <NA>           NA      <NA>              NA               NA
##       availability_60 availability_90 bathrooms bed_type bedrooms beds
## 775                NA              NA        NA     <NA>       NA   NA
## 10274              NA              NA        NA     <NA>       NA   NA
##       cancellation_policy city city_name cleaning_fee country country_code
## 775                  <NA> <NA>      <NA>           NA    <NA>         <NA>
## 10274                <NA> <NA>      <NA>           NA    <NA>         <NA>
##       description experiences_offered extra_people first_review guests_included
## 775          <NA>                <NA>           NA         <NA>              NA
## 10274        <NA>                <NA>           NA         <NA>              NA
##       host_about host_acceptance_rate host_has_profile_pic
## 775         <NA>                   NA                 <NA>
## 10274       <NA>                   NA                 <NA>
##       host_identity_verified host_is_superhost host_listings_count
## 775                     <NA>              <NA>                  NA
## 10274                   <NA>              <NA>                  NA
##       host_location host_name host_neighbourhood host_response_rate
## 775            <NA>      <NA>               <NA>                 NA
## 10274          <NA>      <NA>               <NA>                 NA
##       host_response_time host_since host_total_listings_count
## 775                 <NA>       <NA>                        NA
## 10274               <NA>       <NA>                        NA
##       host_verifications house_rules instant_bookable interaction
## 775                 <NA>        <NA>             <NA>        <NA>
## 10274               <NA>        <NA>             <NA>        <NA>
##       is_business_travel_ready is_location_exact jurisdiction_names latitude
## 775                       <NA>              <NA>               <NA>       NA
## 10274                     <NA>              <NA>               <NA>       NA
##       license longitude market maximum_nights minimum_nights monthly_price name
## 775      <NA>        NA   <NA>             NA             NA            NA <NA>
## 10274    <NA>        NA   <NA>             NA             NA            NA <NA>
##       neighborhood_overview neighbourhood notes price property_type
## 775                    <NA>          <NA>  <NA>    NA          <NA>
## 10274                  <NA>          <NA>  <NA>    NA          <NA>
##       require_guest_phone_verification require_guest_profile_picture
## 775                               <NA>                          <NA>
## 10274                             <NA>                          <NA>
##       requires_license room_type security_deposit smart_location space
## 775               <NA>      <NA>               NA           <NA>  <NA>
## 10274             <NA>      <NA>               NA           <NA>  <NA>
##       square_feet state street summary transit weekly_price zipcode
## 775            NA  <NA>   <NA>    <NA>    <NA>           NA      NA
## 10274          NA  <NA>   <NA>    <NA>    <NA>           NA      NA

Convert t’s and f’s to 1s and 0s

# 1. df$host_has_profile_pic - factor
df_x$host_has_profile_pic <- ifelse(df_x_main$host_has_profile_pic == "t", 1, 0)
qplot(x = host_has_profile_pic, data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 142 rows containing non-finite values (stat_bin).

  # test data
tdf$host_has_profile_pic <- ifelse(tdf_x_main$host_has_profile_pic == "t", 1, 0)

# 2. df$host_identity_verified - factor
df_x$host_identity_verified <- ifelse(df_x_main$host_identity_verified == "t", 1, 0)
qplot(x = host_identity_verified, data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 148 rows containing non-finite values (stat_bin).

  # test data
tdf$host_identity_verified <- ifelse(tdf_x_main$host_identity_verified == "t", 1, 0)

# 3. df$host_is_superhost - factor
df_x$host_is_superhost <- ifelse(df_x_main$host_is_superhost== "t", 1, 0)
qplot(x = host_is_superhost, data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 142 rows containing non-finite values (stat_bin).

  # test data
tdf$host_is_superhost <- ifelse(tdf_x_main$host_is_superhost== "t", 1, 0)

# 4. df$instant_bookable - factor
df_x$instant_bookable <- ifelse(df_x_main$instant_bookable == "t", 1, 0)
qplot(x = instant_bookable, data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 12 rows containing non-finite values (stat_bin).

  # test data
tdf$instant_bookable <- ifelse(tdf_x_main$instant_bookable == "t", 1, 0)  

# 5. df$is_business_travel_ready - factor
df_x$is_business_travel_ready <- ifelse(is.na(df_x_main$is_business_travel_ready), df_x_main$is_business_travel_ready, ifelse(df_x_main$is_business_travel_ready == "t", 1, 0))
qplot(x = is_business_travel_ready, data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 44551 rows containing non-finite values (stat_bin).

  # test data
tdf$is_business_travel_ready <- ifelse(is.na(tdf_x_main$is_business_travel_ready), tdf_x_main$is_business_travel_ready, ifelse(tdf_x_main$is_business_travel_ready == "t", 1, 0))

# 6. df$is_location_exact - factor
df_x$is_location_exact <- ifelse(df_x_main$is_location_exact == "t", 1, 0)
qplot(x = is_location_exact, data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 19 rows containing non-finite values (stat_bin).

  # test data
tdf$is_location_exact <- ifelse(tdf_x_main$is_location_exact == "t", 1, 0)

# 7. df$require_guest_phone_verification - factor
df_x$require_guest_phone_verification <- ifelse(df_x_main$require_guest_phone_verification == "t", 1, 0)
qplot(x = require_guest_phone_verification, data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 19 rows containing non-finite values (stat_bin).

  # test data
tdf$require_guest_phone_verification <- ifelse(tdf_x_main$require_guest_phone_verification == "t", 1, 0)

# 8. df$require_guest_profile_picture - factor
df_x$require_guest_profile_picture <- ifelse(df_x_main$require_guest_profile_picture == "t", 1, 0)
qplot(x = require_guest_profile_picture, data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 19 rows containing non-finite values (stat_bin).

  # test data
tdf$require_guest_profile_picture <- ifelse(tdf_x_main$require_guest_profile_picture == "t", 1, 0)

# 9. df$requires_license - factor
df_x$requires_license <- ifelse(df_x_main$requires_license == "t", 1, 0)
qplot(x = requires_license, data = df_x)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 19 rows containing non-finite values (stat_bin).

  # test data
tdf$requires_license <- ifelse(tdf_x_main$requires_license == "t", 1, 0)

Moving on to other categorical variables -

1 bed_type Factor 2 cancellation_policy Factor 3 city Factor 4 city_name Factor 5 country Factor 6 country_code Factor 7 experiences_offered Factor # 8 host_has_profile_pic Factor # 9 host_identity_verified Factor # 10 host_is_superhost Factor 11 host_location Factor 12 host_neighbourhood Factor 13 host_response_time Factor # 14 instant_bookable Factor # 15 is_business_travel_ready Factor # 16 is_location_exact Factor 17 jurisdiction_names Factor 18 license Factor 19 market Factor 20 neighbourhood Factor 21 property_type Factor # 22 require_guest_phone_verification Factor # 23 require_guest_profile_picture Factor # 24 requires_license Factor 25 room_type Factor 26 smart_location Factor 27 state Factor

qplot(df_x$bed_type) + coord_flip()

qplot(df_x$cancellation_policy) + coord_flip()

qplot(df_x$city) + coord_flip() # unclear - need to look at the table

qplot(df_x$city_name) + coord_flip()

qplot(df_x$country)+ coord_flip()

qplot(df_x$country_code)+ coord_flip()

qplot(df_x$experiences_offered)+ coord_flip()

qplot(df_x$host_location)+ coord_flip() # Unclear - need to look at the table

qplot(df_x$host_neighbourhood)+ coord_flip() # Unclear - need to look at the table

qplot(df_x$host_response_time)+ coord_flip()

qplot(df_x$jurisdiction_names)+ coord_flip() # Categorical list

qplot(df_x$license)+ coord_flip() # Unclear - need to look at the table

qplot(df_x$market)+ coord_flip() # Looks clean for the most part

qplot(df_x$neighbourhood)+ coord_flip() # Unclear - need to look at the table

qplot(df_x$property_type)+ coord_flip() # Looks neat for the most part

qplot(df_x$room_type)+ coord_flip() # clean

qplot(df_x$smart_location)+ coord_flip() # Unclear - need to look at the table

qplot(df_x$state)+ coord_flip() # needs to be cleaned

Table views for the variables with unclear plots -

# View(table(df_x$city)) # 738 categories
# View(table(df_x$host_location)) # 2154 categories; Format like so - Liverpool, England, United Kingdom
# View(table(df_x$host_neighbourhood)) # 1303 categories
# View(table(df_x$license)) # Looks like some unique ID - 7474 many categories
# View(table(df_x$neighbourhood)) # 1093 categories; Examples - Woodside, Windsor
# View(table(df_x$smart_location)) # 750 categories - Can be cleaned by converting all to upper case

First, let’s clean the state column -

library(rapportools) # for trim.space
## Warning: package 'rapportools' was built under R version 3.6.3
## Loading required package: reshape
## Warning: package 'reshape' was built under R version 3.6.3
## 
## Attaching package: 'reshape'
## The following object is masked from 'package:lubridate':
## 
##     stamp
## The following objects are masked from 'package:tidyr':
## 
##     expand, smiths
## The following object is masked from 'package:dplyr':
## 
##     rename
## 
## Attaching package: 'rapportools'
## The following object is masked from 'package:dplyr':
## 
##     n
## The following objects are masked from 'package:stats':
## 
##     IQR, median, sd, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, range, sum
# View with all values in upper case

View(table(str_to_upper(df_x$state)))

# The data discrepancies are -

# 1. BAJA CALIFORNIA
# 2. NEW YORK
# 3. SECC TERRAZAS
# 4. MP

# Let's change that 

df_x$state <- trim.space(str_to_upper(df_x$state))

df_x$state[df_x$state == 'NEW YORK'] <- 'NY'
df_x$state[df_x$state == 'BAJA CALIFORNIA'] <- 'CA'
df_x$state[df_x$state == 'SECC TERRAZAS'] <- 'CA'
df_x$state[df_x$state == 'MP'] <- 'NY'

# There are only 17 samples for MD which can be changed to DC
# There are only 2 samples for NJ which can be changed to NY

df_x$state[df_x$state == 'MD'] <- 'DC'
df_x$state[df_x$state == 'NJ'] <- 'NY'

# There is a separate column for LA which can be changed to CA

df_x$state[df_x$state == 'LA'] <- 'CA'
unique(df_x$state)
##  [1] "TN" "CA" "DC" "NY" "WA" "IL" "MA" "TX" "OR" "CO" "NC" NA
  # test data

tdf$state <- trim.space(str_to_upper(tdf$state))
View(table(tdf$state))

tdf$state[tdf$state == 'LA'] <- 'CA'
tdf$state[tdf$state == 'MD'] <- 'DC'

unique(tdf$state)
##  [1] "NY" "CA" "OR" "TX" "CO" "IL" "TN" "DC" "WA" "MA" "NC" NA
# View(sort(table(tdf$city), decreasing = T)) 
# View(table(tdf$host_location)) 
# View(table(tdf$host_neighbourhood)) 
# View(table(tdf$license)) 
# View(table(tdf$neighbourhood)) 
# View(table(tdf$smart_location)) 

They are pretty much okay.

Let’s capitalize and trim these categorical columns -

# city
df_x$city <- trim.space(str_to_upper(df_x$city))
  # test data
tdf$city <- trim.space(str_to_upper(tdf$city))

# city_name
df_x$city_name <- trim.space(str_to_upper(df_x$city_name))
  # test data
tdf$city_name <- trim.space(str_to_upper(tdf$city_name))

# host_location
df_x$host_location <- trim.space(str_to_upper(df_x$host_location))
  # test data
tdf$host_location <- trim.space(str_to_upper(tdf$host_location))

# host_neighborhood
df_x$host_neighbourhood <- trim.space(str_to_upper(df_x$host_neighbourhood))
  # test data
tdf$host_neighbourhood <- trim.space(str_to_upper(tdf$host_neighbourhood))

# neighborhood
df_x$neighbourhood <- trim.space(str_to_upper(df_x$neighbourhood))
  # test data
tdf$neighbourhood <- trim.space(str_to_upper(tdf$neighbourhood))

# smart_location
df_x$smart_location <- trim.space(str_to_upper(df_x$smart_location))
  # test data
tdf$smart_location <- trim.space(str_to_upper(tdf$smart_location)) 

Now let’s view the number of categories for each of these variables and see if they have reduced -

length(unique(df_x_main$city)) # 739
## [1] 739
length(unique(df_x$city)) # 526
## [1] 526
length(unique(df_x_main$host_location)) # 2155
## [1] 2155
length(unique(df_x$host_location)) # 2118
## [1] 2118
length(unique(df_x_main$host_neighbourhood)) # 1304
## [1] 1304
length(unique(df_x$host_neighbourhood)) # 1295
## [1] 1295
length(unique(df_x_main$neighborhood)) # 62392
## [1] 62392
length(unique(df_x$neighborhood)) # 62392
## [1] 62392
length(unique(df_x_main$smart_location)) # 751
## [1] 751
length(unique(df_x$smart_location)) # 634
## [1] 634

The test data -

length(unique(tdf_x_main$city)) # 265
## [1] 265
length(unique(tdf$city)) # 221
## [1] 221
length(unique(tdf_x_main$host_location)) # 600
## [1] 600
length(unique(tdf$host_location)) # 593
## [1] 593
length(unique(tdf_x_main$host_neighbourhood)) # 879
## [1] 879
length(unique(tdf$host_neighbourhood)) # 877
## [1] 877
length(unique(tdf_x_main$neighborhood)) # 8194
## [1] 8194
length(unique(tdf$neighborhood)) # 8194
## [1] 8194
length(unique(tdf_x_main$smart_location)) # 268
## [1] 268
length(unique(tdf$smart_location)) # 250
## [1] 250

Now, the date columns; Let’s convert them to date type; And then extract the number of years of experience the listing’s host has had as a host; We shall also find the number of years since becoming a host did it take for them to get their first review -

df_x <- df_x %>%
  mutate(first_review = ymd(first_review),
         host_since = ymd(host_since),
         host_age = 2020 - year(host_since),
         start_to_review_gap = year(first_review) - year(host_since))

  # test data
tdf <- tdf %>%
  mutate(first_review = ymd(first_review),
         host_since = ymd(host_since),
         host_age = 2020 - year(host_since),
         start_to_review_gap = year(first_review) - year(host_since))

View the distribution of the host_age and start_to_review_gap -

qplot(df_x$host_since, df_x$first_review)
## Warning: Removed 151 rows containing missing values (geom_point).

qplot(df_x$host_age)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 151 rows containing non-finite values (stat_bin).

qplot(df_x$start_to_review_gap)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 151 rows containing non-finite values (stat_bin).

Replace the NAs in start_to_review_gap and host_age with the respective means of the column

library(tidyr)

sTor_gap <- mean(df_x$start_to_review_gap)
df_x$start_to_review_gap <- replace_na(df_x$start_to_review_gap, sTor_gap)
  # test data
tdf$start_to_review_gap <- replace_na(tdf$start_to_review_gap, sTor_gap)

host_age_mean <- mean(df_x$host_age)
df_x$host_age <- replace_na(df_x$host_age, host_age_mean)
  # test data
tdf$host_age <- replace_na(tdf$start_to_review_gap, host_age_mean)

There are some start_to_review_gaps which are lesser than zero (where the first_review date is earlier than the host_since date)

Let’s clean it up

df_x$start_to_review_gap[df_x$start_to_review_gap < 0] <- sTor_gap
  # test data
tdf$start_to_review_gap[tdf$start_to_review_gap < 0] <- sTor_gap

Let’s bucket these variables based on the quantiles -

# host_age / experience

df_x <- df_x %>% 
  mutate(host_age_q = ntile(host_age, 5))


# df_x %>% 
#   mutate(host_age_q = ntile(host_age, 5)) %>%
#   group_by(host_age_q, host_age) %>%
#   summarize(n = n()) %>%
#   group_by(host_age_q) %>%
#   summarize(total = sum(n),
#             min_age = min(host_age),
#             max_age = max(host_age))

# 1 20000   2   4   
# 2 20000   4   5   
# 3 20000   5   6   
# 4 20000   6   8   
# 5 20000   8   12  


df_x <- df_x %>%    
    mutate( 
        host_lowExp = ifelse(host_age_q == 1, 1, 0),
        host_lowMedExp = ifelse(host_age_q == 2, 1, 0),
        host_medExp = ifelse(host_age_q == 3, 1, 0),
        host_medHighExp = ifelse(host_age_q == 4, 1, 0),
        host_highExp = ifelse(host_age_q == 5, 1, 0)
        )

tdf <- tdf %>%
  mutate(
      host_lowExp = ifelse((host_age >= 2) & (host_age < 4), 1, 0),
        host_lowMedExp = ifelse((host_age >= 4) & (host_age < 5), 1, 0),
        host_medExp = ifelse((host_age >= 5) & (host_age < 6), 1, 0),
        host_medHighExp = ifelse((host_age >= 6) & (host_age < 8), 1, 0),
        host_highExp = ifelse((host_age >= 8) & (host_age < 12), 1, 0) 
  )

Now the start_to_review_gap -

Let’s first check the outliers to impute -

sTorR_OutVals <- boxplot(df_x$start_to_review_gap)$out

sTorR_OutVals
##   [1] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 9 8 8 8
##  [38] 8 8 9 8 8 8 8 8 8 8 8 8 8 9 9 8 9 8 8 8 8 8 8 8 8 8 8 9 8 8 9 9 8 8 8 8 9
##  [75] 8 8 8 8 8 8 9 8 8 8 9 8 8 9 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 9 8 8 8 8 8
## [112] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 9 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
## [149] 8 8 9 8 8 8 9 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 9 8 8 8 8 8 8 8 9 9
sTorR_TestOutVals <- boxplot(tdf$start_to_review_gap)$out

sTorR_TestOutVals
##  [1] 8 8 8 8 8 8 8 8 8 9 8 8 8 8 8 8 8 8 8 8 9 8

Change the outliers -

df_x$start_to_review_gap[df_x$start_to_review_gap > 7] <- 6
tdf$start_to_review_gap[tdf$start_to_review_gap > 7] <- 6
df_x <- df_x %>% 
  mutate(sTor_gap_q = ntile(start_to_review_gap, 3))


# df_x %>%
#   mutate(sTor_gap_q = ntile(start_to_review_gap, 3)) %>%
#   group_by(sTor_gap_q, start_to_review_gap) %>%
#   summarize(n = n()) %>%
#   group_by(sTor_gap_q) %>%
#   summarize(total = sum(n),
#             min_age = min(start_to_review_gap),
#             max_age = max(start_to_review_gap))

# 1 33334   0   1   
# 2 33333   1   2   
# 3 33333   2   7   

df_x <- df_x %>%
    mutate(
        NoGap_sTor = ifelse(sTor_gap_q == 1, 1, 0),
        Yr1Gap_sTor = ifelse(sTor_gap_q == 2, 1, 0),
        HighGap_sTor = ifelse(sTor_gap_q == 3, 1, 0)
        )

tdf <- tdf %>%
  mutate(
      NoGap_sTor = ifelse(start_to_review_gap == 0, 1, 0),
        Yr1Gap_sTor = ifelse(start_to_review_gap == 1, 1, 0),
        HighGap_sTor = ifelse(start_to_review_gap >= 2, 1, 0)
  )

Dummy variables for state -

df_x <- df_x %>%
    mutate(
        In_CA = ifelse(state == 'CA', 1, 0),
        In_CO = ifelse(state == 'CO', 1, 0),
        In_DC = ifelse(state == 'DC', 1, 0),
        In_IL = ifelse(state == 'IL', 1, 0),
        In_MA = ifelse(state == 'MA', 1, 0),
        In_NC = ifelse(state == 'NC', 1, 0),
        In_NY = ifelse(state == 'NY', 1, 0),
        In_OR = ifelse(state == 'OR', 1, 0),
        In_TN = ifelse(state == 'TN', 1, 0),
        In_TX = ifelse(state == 'TX', 1, 0),
        In_WA = ifelse(state == 'WA', 1, 0)
        )
  # test data

tdf <- tdf %>%
    mutate(
        In_CA = ifelse(state == 'CA', 1, 0),
        In_CO = ifelse(state == 'CO', 1, 0),
        In_DC = ifelse(state == 'DC', 1, 0),
        In_IL = ifelse(state == 'IL', 1, 0),
        In_MA = ifelse(state == 'MA', 1, 0),
        In_NC = ifelse(state == 'NC', 1, 0),
        In_NY = ifelse(state == 'NY', 1, 0),
        In_OR = ifelse(state == 'OR', 1, 0),
        In_TN = ifelse(state == 'TN', 1, 0),
        In_TX = ifelse(state == 'TX', 1, 0),
        In_WA = ifelse(state == 'WA', 1, 0)
        )

Let’s also include city_name dummies. There might be more variance with these because parameters might change based on cities rather than state on the whole -

unique(df_x$city_name)
##  [1] "NASHVILLE"     "LOS ANGELES"   "SAN DIEGO"     "WASHINGTON DC"
##  [5] "NEW ORLEANS"   "SAN FRANCISCO" "NEW YORK"      "SEATTLE"      
##  [9] "CHICAGO"       "BOSTON"        "AUSTIN"        "PORTLAND"     
## [13] "DENVER"        "SANTA CRUZ"    "OAKLAND"       "ASHEVILLE"    
## [17] NA
unique(tdf$city_name)
##  [1] "NEW YORK"      "LOS ANGELES"   "NEW ORLEANS"   "PORTLAND"     
##  [5] "AUSTIN"        "SAN DIEGO"     "DENVER"        "CHICAGO"      
##  [9] "OAKLAND"       "NASHVILLE"     "WASHINGTON DC" "SEATTLE"      
## [13] "SAN FRANCISCO" "BOSTON"        "ASHEVILLE"     "SANTA CRUZ"   
## [17] NA

Let’s create dummies then,

df_x <- df_x %>%
    mutate(
        In_NAS = ifelse(city_name == 'NASHVILLE', 1, 0),
        In_LA = ifelse(city_name == 'LOS ANGELES', 1, 0),
        In_SD = ifelse(city_name == 'SAN DIEGO', 1, 0),
        In_WDC = ifelse(city_name == 'WASHINGTON DC', 1, 0),
        In_NWO = ifelse(city_name == 'NEW ORLEANS', 1, 0),
        In_SFO = ifelse(city_name == 'SAN FRANCISCO', 1, 0),
        In_NYC = ifelse(city_name == 'NEW YORK', 1, 0),
        In_SEA = ifelse(city_name == 'SEATTLE', 1, 0),
        In_CHI = ifelse(city_name == 'CHICAGO', 1, 0),
        In_BOS = ifelse(city_name == 'BOSTON', 1, 0),
        In_AUS = ifelse(city_name == 'AUSTIN', 1, 0),
        In_PO = ifelse(city_name == 'PORTLAND', 1, 0),
        In_DEN = ifelse(city_name == 'DENVER', 1, 0),
        In_SCU = ifelse(city_name == 'SANTA CRUZ', 1, 0),
        In_OAK = ifelse(city_name == 'OAKLAND', 1, 0),
        In_ASH = ifelse(city_name == 'ASHEVILLE', 1, 0)
        )
  # test data

tdf <- tdf %>%
    mutate(
        In_NAS = ifelse(city_name == 'NASHVILLE', 1, 0),
        In_LA = ifelse(city_name == 'LOS ANGELES', 1, 0),
        In_SD = ifelse(city_name == 'SAN DIEGO', 1, 0),
        In_WDC = ifelse(city_name == 'WASHINGTON DC', 1, 0),
        In_NWO = ifelse(city_name == 'NEW ORLEANS', 1, 0),
        In_SFO = ifelse(city_name == 'SAN FRANCISCO', 1, 0),
        In_NYC = ifelse(city_name == 'NEW YORK', 1, 0),
        In_SEA = ifelse(city_name == 'SEATTLE', 1, 0),
        In_CHI = ifelse(city_name == 'CHICAGO', 1, 0),
        In_BOS = ifelse(city_name == 'BOSTON', 1, 0),
        In_AUS = ifelse(city_name == 'AUSTIN', 1, 0),
        In_PO = ifelse(city_name == 'PORTLAND', 1, 0),
        In_DEN = ifelse(city_name == 'DENVER', 1, 0),
        In_SCU = ifelse(city_name == 'SANTA CRUZ', 1, 0),
        In_OAK = ifelse(city_name == 'OAKLAND', 1, 0),
        In_ASH = ifelse(city_name == 'ASHEVILLE', 1, 0)
        )

Cancellation Policy -

# View(table(df_x$cancellation_policy))

# 7 strict  47372
# 5 moderate    30411
# 4 flexible    21837
# 8 super_strict_30 261
# 9 super_strict_60 106
# 6 no_refunds  4

strict <- c("no_refunds", "super_strict_60", "super_strict_30", "strict")


df_x <- df_x %>%
    mutate(
        CancelPol_Strict = ifelse(cancellation_policy %in% strict, 1, 0),
        CancelPol_Mod = ifelse(cancellation_policy == 'moderate', 1, 0),
        CancelPol_Flexy = ifelse(cancellation_policy == 'flexible', 1, 0)
        )
  # test data
tdf <- tdf %>%
    mutate(
        CancelPol_Strict = ifelse(cancellation_policy %in% strict, 1, 0),
        CancelPol_Mod = ifelse(cancellation_policy == 'moderate', 1, 0),
        CancelPol_Flexy = ifelse(cancellation_policy == 'flexible', 1, 0)
        )

Room Type and Property Type -

# View(table(df_x$room_type))

# 1 Entire home/apt 60892
# 2 Private room    36617
# 3 Shared room 2472

# View(table(tdf$room_type))

# 1 Entire home/apt 7425
# 2 Private room    4480
# 3 Shared room 300

df_x <- df_x %>%
    mutate(
        WholePlace = ifelse(room_type == 'Entire home/apt', 1, 0),
        PrivateRoom = ifelse(room_type == 'Private room', 1, 0),
        SharedRoom = ifelse(room_type == 'Shared room', 1, 0)
        )
  # test data
tdf <- tdf %>%
    mutate(
        WholePlace = ifelse(room_type == 'Entire home/apt', 1, 0),
        PrivateRoom = ifelse(room_type == 'Private room', 1, 0),
        SharedRoom = ifelse(room_type == 'Shared room', 1, 0)
        )

Property type -

df_x$property_type <- trim.space(str_to_upper(df_x$property_type))
tdf$property_type <- trim.space(str_to_upper(tdf$property_type))

# View(table(df_x$property_type))

apartment_types = c("APARTMENT", "CONDOMINIUM", "LOFT", "IN-LAW")
house_types = c("HOUSE", "TOWNHOUSE", "GUESTHOUSE", "TINY HOUSE", "BUNGALOW", "VILLA", "CASA PARTICULAR (CUBA)")
hotel_types = c("BED & BREAKFAST", "BED AND BREAKFAST", "SERVICED APARTMENT", "HOTEL", "APARTHOTEL", "GUEST SUITE")
holiday_types = c("RESORT", "VACATION HOME", "CABIN", "CHALET", "TIMESHARE", "BOAT", "TREEHOUSE", "YURT", "COTTAGE", "HUT", "ISLAND", "BOUTIQUE HOTEL", "CASTLE", "EARTH HOUSE", "CAVE", "TRAIN", "FARM STAY", "BARN", "LIGHTHOUSE", "NATURE LODGE", "PLANE")
other_types = c("OTHER", "DORM", "CAMPER/RV", "HOSTEL", "TENT", "TIPI")

df_x <- df_x %>%
    mutate(
        propertyApartment = ifelse(property_type %in% apartment_types, 1, 0),
        propertyHouse = ifelse(property_type %in% house_types, 1, 0),
        propertyHotel = ifelse(property_type %in% hotel_types, 1, 0),
        propertyHoliday = ifelse(property_type %in% holiday_types, 1, 0),
        propertyOther = ifelse(property_type %in% other_types, 1, 0)
        )
  # test data
# View(table(tdf$property_type))

tdf <- tdf %>%
    mutate(
        propertyApartment = ifelse(property_type %in% apartment_types, 1, 0),
        propertyHouse = ifelse(property_type %in% house_types, 1, 0),
        propertyHotel = ifelse(property_type %in% hotel_types, 1, 0),
        propertyHoliday = ifelse(property_type %in% holiday_types, 1, 0),
        propertyOther = ifelse(property_type %in% other_types, 1, 0)
        )

Bed Type -

# View(table(df_x$bed_type))
# View(table(tdf$bed_type))
# View(table(df_x$bed_type, df_y$high_booking_rate))
  
# 3   Airbed    0   449
# 4   Couch 0   213
# 5   Futon 0   805
# 6   Pull-out Sofa 0   556
# 7   Real Bed  0   72828
# 
# 10    Airbed  1   89
# 11    Couch   1   48
# 12    Futon   1   191
# 13    Pull-out Sofa   1   164
# 14    Real Bed    1   24638

uncomfyBeds = c("Airbed", "Couch", "Futon", "Pull-out Sofa")

df_x <- df_x %>%
    mutate(
        bed_Real = ifelse(bed_type == 'Real Bed', 1, 0),
        bed_Uncomfy = ifelse(bed_type %in% uncomfyBeds, 1, 0)
        )
  # test data
tdf <- tdf %>%
    mutate(
        bed_Real = ifelse(bed_type == 'Real Bed', 1, 0),
        bed_Uncomfy = ifelse(bed_type %in% uncomfyBeds, 1, 0)
        )

host_response_time

# View(table(df_x$host_response_time, df_y$high_booking_rate))
# View(table(df_x$host_response_time))
# View(table(tdf$host_response_time))

# 1 a few days or more  0   1350
# 2 f   0   0
# 3 within a day    0   9800
# 4 within a few hours  0   14333
# 5 within an hour  0   33865
# 6 a few days or more  1   55
# 7 f   1   0
# 8 within a day    1   857
# 9 within a few hours  1   2929
# 10    within an hour  1   21008

# replacing na with the second most popular category
df_x$host_response_time <- replace_na(df_x$host_response_time, 'within a few hours')

df_x <- df_x %>%
    mutate(
        hRes_FewDays = ifelse(host_response_time == 'a few days or more', 1, 0),
        hRes_1Day = ifelse(host_response_time == 'within a day', 1, 0),
        hRes_FewHours = ifelse(host_response_time == 'within a few hours', 1, 0),
        hRes_1Hour = ifelse(host_response_time == 'within an hour', 1, 0)
        )
  # test data
tdf <- tdf %>%
    mutate(
        hRes_FewDays = ifelse(host_response_time == 'a few days or more', 1, 0),
        hRes_1Day = ifelse(host_response_time == 'within a day', 1, 0),
        hRes_FewHours = ifelse(host_response_time == 'within a few hours', 1, 0),
        hRes_1Hour = ifelse(host_response_time == 'within an hour', 1, 0)
        )
View(df_x$host_verifications)
View(df_x$amenities)

Some text mining on the columns with categorical lists -

library(qdap)
## Warning: package 'qdap' was built under R version 3.6.3
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## Warning: package 'qdapRegex' was built under R version 3.6.3
## 
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:dplyr':
## 
##     explain
## The following object is masked from 'package:ggplot2':
## 
##     %+%
## Loading required package: qdapTools
## Warning: package 'qdapTools' was built under R version 3.6.3
## 
## Attaching package: 'qdapTools'
## The following object is masked from 'package:dplyr':
## 
##     id
## Loading required package: RColorBrewer
## 
## Attaching package: 'qdap'
## The following object is masked from 'package:reshape':
## 
##     condense
## The following object is masked from 'package:forcats':
## 
##     %>%
## The following object is masked from 'package:stringr':
## 
##     %>%
## The following object is masked from 'package:purrr':
## 
##     %>%
## The following object is masked from 'package:tidyr':
## 
##     %>%
## The following object is masked from 'package:tibble':
## 
##     %>%
## The following object is masked from 'package:dplyr':
## 
##     %>%
## The following object is masked from 'package:base':
## 
##     Filter
df_x$host_verifications <- as.character(df_x$host_verifications)
df_x$host_verifications <- trim.space(str_to_lower(df_x$host_verifications))
freq_verifs <- freq_terms(df_x$host_verifications, 50)
plot(freq_verifs)

Dummies for number of host verifications

df_x <- df_x %>%
    mutate(
        No_of_Verifs = str_count(df_x$host_verifications, ",") + 1
        )

# View(table(df_x$No_of_Verifs))

# 1   1   241
# 2   2   1978
# 3   3   16077
# 4   4   35876
# 5   5   23967
# 6   6   10773
# 7   7   4552
# 8   8   3522
# 9   9   1984
# 10    10  707
# 11    11  217
# 12    12  71
# 13    13  14
# 14    14  2

df_x <- df_x %>%
    mutate(
        verifs_vLow = ifelse((No_of_Verifs >= 1) & (No_of_Verifs <= 2), 1, 0),
        verifs_Med = ifelse((No_of_Verifs >= 3) & (No_of_Verifs <= 5), 1, 0),
        verifs_High = ifelse((No_of_Verifs >= 6) & (No_of_Verifs <= 8), 1, 0),
        verifs_VHigh = ifelse(No_of_Verifs >= 9, 1, 0)
        )
  # test data
tdf <- tdf %>%
    mutate(
        No_of_Verifs = str_count(tdf$host_verifications, ",") + 1
        )

tdf <- tdf %>%
    mutate(
        verifs_vLow = ifelse((No_of_Verifs >= 1) & (No_of_Verifs <= 2), 1, 0),
        verifs_Med = ifelse((No_of_Verifs >= 3) & (No_of_Verifs <= 5), 1, 0),
        verifs_High = ifelse((No_of_Verifs >= 6) & (No_of_Verifs <= 8), 1, 0),
        verifs_VHigh = ifelse(No_of_Verifs >= 9, 1, 0)
        )

Create further dummies based on popular verification types -

# 1 phone 2 reviews 3 email 4 jumio 5 kba 6 facebook 7 governmentid 8 workemail 9 offlinegovernmentid 10 google

df_x <- df_x %>%
  mutate(
    facebookVer = ifelse(grepl("facebook", host_verifications), 1, 0),
    googleVer = ifelse(grepl("google", host_verifications), 1, 0),
    govtIDVer = ifelse(grepl("government_id", host_verifications), 1, 0),
    jumioVer = ifelse(grepl("jumio", host_verifications), 1, 0),
    kbaVer = ifelse(grepl("kba", host_verifications), 1, 0)
  )
  # test data

tdf <- tdf %>%
  mutate(
    facebookVer = ifelse(grepl("facebook", host_verifications), 1, 0),
    googleVer = ifelse(grepl("google", host_verifications), 1, 0),
    govtIDVer = ifelse(grepl("government_id", host_verifications), 1, 0),
    jumioVer = ifelse(grepl("jumio", host_verifications), 1, 0),
    kbaVer = ifelse(grepl("kba", host_verifications), 1, 0)
  )

Now, the harder one - amenities -

df_x$amenities <- as.character(df_x$amenities)
df_x$amenities <- trim.space(str_to_lower(df_x$amenities))
df_x$amenities <- str_replace_all(df_x$amenities, " ", "_")
# View(df_x$amenities)
  # test data
tdf$amenities <- as.character(tdf$amenities)
tdf$amenities <- trim.space(str_to_lower(tdf$amenities))
tdf$amenities <- str_replace_all(tdf$amenities, " ", "_")

Further, let’s count the number of amenities as well as the most popular amenities -

df_x <- df_x %>%
  mutate(No_of_Amens = str_count(amenities, ",") + 1)

# df_x %>% 
#   mutate(amens_q = ntile(No_of_Amens, 5)) %>%
#   group_by(amens_q, No_of_Amens) %>%
#   summarize(n = n()) %>%
#   group_by(amens_q) %>%
#   summarize(total = sum(n),
#             min_age = min(No_of_Amens),
#             max_age = max(No_of_Amens))

# 1 19999   1     13    
# 2 19998   13  17  
# 3 19998   17  21  
# 4 19998   21  25  
# 5 19998   25  78

amens_avg <- mean(df_x$No_of_Amens)

df_x$No_of_Amens <- replace_na(df_x$No_of_Amens, amens_avg)

df_x <- df_x %>%
  mutate(
    vlowAmens = ifelse((No_of_Amens >= 1) & (No_of_Amens < 13), 1, 0),
    lowAmens = ifelse((No_of_Amens >= 13) & (No_of_Amens < 17), 1, 0),
    medAmens = ifelse((No_of_Amens >= 17) & (No_of_Amens < 21), 1, 0),
    highAmens = ifelse((No_of_Amens >= 21) & (No_of_Amens < 25), 1, 0),
    vhighAmens = ifelse(No_of_Amens > 25, 1, 0)
         )
  # test data
tdf <- tdf %>%
  mutate(No_of_Amens = str_count(amenities, ",") + 1)

tdf <- tdf %>%
  mutate(
    vlowAmens = ifelse((No_of_Amens >= 1) & (No_of_Amens < 13), 1, 0),
    lowAmens = ifelse((No_of_Amens >= 13) & (No_of_Amens < 17), 1, 0),
    medAmens = ifelse((No_of_Amens >= 17) & (No_of_Amens < 21), 1, 0),
    highAmens = ifelse((No_of_Amens >= 21) & (No_of_Amens < 25), 1, 0),
    vhighAmens = ifelse(No_of_Amens > 25, 1, 0)
         )
k <- gsub(",", " ", df_x$amenities)
freq_amenities <- freq_terms(k, 50)
plot(freq_amenities)

More dummies for specific amenities -

kitchen_amens <- "kitchen|microwave|stove"
kitchen2_amens <- "coffee_maker|refrigerator"
heat_amens <- "heating"
ac_amens <- "air_conditioning"
tv_amens <- "tv|cable_tv"
wifi_amens <- "internet|wireless_internet|wifi"
pets_amens <- "pets_allowed|dogs"
luxury_amens <- "gym|hot_tub|pool|indoor_fireplace|breakfast"
petsLiveHere <- "pets_live_on_this_property"
workspace_amens <- "laptop_friendly_workspace"
parking_amens <- "free_parking_on_premises"
extrabedding <- "bed_linens|extra_pillows_and_blankets"
elevator_amens <- "elevator|elevator_in_building"
privacy_amens <- "private_entrance|lock_on_bedroom_door"

df_x[grep(kitchen_amens, df_x$amenities, value = F), "hasKitchen1"] <- 1
df_x[grep(kitchen2_amens, df_x$amenities, value = F), "hasKitchen2"] <- 1
df_x[grep(heat_amens, df_x$amenities, value = F), "hasHeating"] <- 1
df_x[grep(ac_amens, df_x$amenities, value = F), "hasAC"] <- 1
df_x[grep(tv_amens, df_x$amenities, value = F), "hasTV"] <- 1
df_x[grep(wifi_amens, df_x$amenities, value = F), "hasWiFi"] <- 1
df_x[grep(pets_amens, df_x$amenities, value = F), "PetsAllowed"] <- 1
df_x[grep(luxury_amens, df_x$amenities, value = F), "hasLuxuryAmens"] <- 1
df_x[grep(petsLiveHere, df_x$amenities, value = F), "petsLiveHere"] <- 1
df_x[grep(workspace_amens, df_x$amenities, value = F), "hasWorkspace"] <- 1
df_x[grep(parking_amens, df_x$amenities, value = F), "hasFreeParking"] <- 1
df_x[grep(extrabedding, df_x$amenities, value = F), "extraBedding"] <- 1
df_x[grep(elevator_amens, df_x$amenities, value = F), "hasElevator"] <- 1
df_x[grep(privacy_amens, df_x$amenities, value = F), "hasPrivacyAmens"] <- 1
  # test data
tdf[grep(kitchen_amens, tdf$amenities, value = F), "hasKitchen1"] <- 1
tdf[grep(kitchen2_amens, tdf$amenities, value = F), "hasKitchen2"] <- 1
tdf[grep(heat_amens, tdf$amenities, value = F), "hasHeating"] <- 1
tdf[grep(ac_amens, tdf$amenities, value = F), "hasAC"] <- 1
tdf[grep(tv_amens, tdf$amenities, value = F), "hasTV"] <- 1
tdf[grep(wifi_amens, tdf$amenities, value = F), "hasWiFi"] <- 1
tdf[grep(pets_amens, tdf$amenities, value = F), "PetsAllowed"] <- 1
tdf[grep(luxury_amens, tdf$amenities, value = F), "hasLuxuryAmens"] <- 1
tdf[grep(petsLiveHere, tdf$amenities, value = F), "petsLiveHere"] <- 1
tdf[grep(workspace_amens, tdf$amenities, value = F), "hasWorkspace"] <- 1
tdf[grep(parking_amens, tdf$amenities, value = F), "hasFreeParking"] <- 1
tdf[grep(extrabedding, tdf$amenities, value = F), "extraBedding"] <- 1
tdf[grep(elevator_amens, tdf$amenities, value = F), "hasElevator"] <- 1
tdf[grep(privacy_amens, tdf$amenities, value = F), "hasPrivacyAmens"] <- 1

Replace NAs in the newly created columns with 0s

amenCols <- c("hasKitchen1", "hasKitchen2", "hasHeating", "hasAC", 
              "hasTV", "hasWiFi", "PetsAllowed", "hasLuxuryAmens", 
              "petsLiveHere", "hasWorkspace", "hasFreeParking", 
              "extraBedding", "hasElevator", "hasPrivacyAmens")

df_x$hasKitchen1 <- replace_na(df_x$hasKitchen1, 0)
df_x$hasKitchen2 <- replace_na(df_x$hasKitchen2, 0)
df_x$hasHeating <- replace_na(df_x$hasHeating, 0)
df_x$hasAC <- replace_na(df_x$hasAC, 0)
df_x$hasTV <- replace_na(df_x$hasTV, 0)
df_x$hasWiFi <- replace_na(df_x$hasWiFi, 0)
df_x$PetsAllowed <- replace_na(df_x$PetsAllowed, 0)
df_x$hasLuxuryAmens <- replace_na(df_x$hasLuxuryAmens, 0)
df_x$petsLiveHere <- replace_na(df_x$petsLiveHere, 0)
df_x$hasWorkspace <- replace_na(df_x$hasWorkspace, 0)
df_x$hasFreeParking <- replace_na(df_x$hasFreeParking, 0)
df_x$extraBedding <- replace_na(df_x$extraBedding, 0)
df_x$hasElevator <- replace_na(df_x$hasElevator, 0)
df_x$hasPrivacyAmens <- replace_na(df_x$hasPrivacyAmens, 0)
  # test data
tdf$hasKitchen1 <- replace_na(tdf$hasKitchen1, 0)
tdf$hasKitchen2 <- replace_na(tdf$hasKitchen2, 0)
tdf$hasHeating <- replace_na(tdf$hasHeating, 0)
tdf$hasAC <- replace_na(tdf$hasAC, 0)
tdf$hasTV <- replace_na(tdf$hasTV, 0)
tdf$hasWiFi <- replace_na(tdf$hasWiFi, 0)
tdf$PetsAllowed <- replace_na(tdf$PetsAllowed, 0)
tdf$hasLuxuryAmens <- replace_na(tdf$hasLuxuryAmens, 0)
tdf$petsLiveHere <- replace_na(tdf$petsLiveHere, 0)
tdf$hasWorkspace <- replace_na(tdf$hasWorkspace, 0)
tdf$hasFreeParking <- replace_na(tdf$hasFreeParking, 0)
tdf$extraBedding <- replace_na(tdf$extraBedding, 0)
tdf$hasElevator <- replace_na(tdf$hasElevator, 0)
tdf$hasPrivacyAmens <- replace_na(tdf$hasPrivacyAmens, 0)

I’d like to see if smoking is allowed or not from the house rules column -

For that, trim spaces and convert to lowercase first -

df_x$house_rules <- as.character(df_x$house_rules)
df_x$house_rules <- trim.space(str_to_lower(df_x$house_rules))
  # test data
tdf$house_rules <- as.character(tdf$house_rules)
tdf$house_rules <- trim.space(str_to_lower(tdf$house_rules))
df_x[grep("no smoking", df_x$house_rules, value = F), "NoSmoking"] <- 1
df_x$NoSmoking[is.na(df_x$NoSmoking)] <- 0
  # test data
tdf[grep("no smoking", tdf$house_rules, value = F), "NoSmoking"] <- 1
tdf$NoSmoking[is.na(tdf$NoSmoking)] <- 0

Is host from USA?

df_x$host_location <- trim.space(str_to_lower(df_x$host_location))
  # test data
tdf$host_location <- trim.space(str_to_lower(tdf$host_location))

df_x <- df_x %>%
  mutate(
    isHost_in_US = ifelse(grepl("united states", host_location), 1, 0)
  )
  # test data
tdf <- tdf %>%
  mutate(
    isHost_in_US = ifelse(grepl("united states", host_location), 1, 0)
  )

More processing on the numerical columns now -

accommodates availability_30 availability_365 availability_60 availability_90 bathrooms bedrooms beds cleaning_fee extra_people guests_included

host_listings_count host_response_rate host_total_listings_count

maximum_nights minimum_nights

price

# View(table(df_x$guests_included)) There are some 0s
# View(table(df_x$accommodates)) There are no 0s

# Replacing the 0s and NAs in guests_included with 1s, the most popular category

df_x$guests_included[df_x$guests_included == 0] <- 1
df_x$guests_included <- replace_na(df_x$guests_included, 1)
 # test data
tdf$guests_included[tdf$guests_included == 0] <- 1
tdf$guests_included <- replace_na(tdf$guests_included, 1)

# The price can also include the cleaning fee

# summary(df_x$cleaning_fee) - median is 50

df_x$cleaning_fee <- replace_na(df_x$cleaning_fee, 50)
  # test data
tdf$cleaning_fee <- replace_na(tdf$cleaning_fee, 50)

# The security deposit has 40K NAs

# summary(df_x$security_deposit)

df_x$security_deposit <- replace_na(df_x$security_deposit, 200)
  # test data
tdf$security_deposit <- replace_na(tdf$security_deposit, 200)

df_x <- df_x %>%
  mutate(
    bathroomGuestRatio = bathrooms/accommodates,
    bedroomsGuestRatio = bedrooms/accommodates,
    bedsGuestsRatio = beds/accommodates,
    pricePerGuest = (price + cleaning_fee)/guests_included,
    sec_dep_perc = ifelse(price >=1 , (100*security_deposit)/price, 60)
  )
  # test data
tdf <- tdf %>%
  mutate(
    bathroomGuestRatio = bathrooms/accommodates,
    bedroomsGuestRatio = bedrooms/accommodates,
    bedsGuestsRatio = beds/accommodates,
    pricePerGuest = (price + cleaning_fee)/guests_included,
    sec_dep_perc = ifelse(price >=1 , (100*security_deposit)/price, 60)
  )

There are some sec_dep_perc that are greater than 100!

Let’s change those -

df_x$sec_dep_perc[df_x$sec_dep_perc > 100] <- 60
  # test data
tdf$sec_dep_perc[tdf$sec_dep_perc > 100] <- 60

Now, we can bin the security deposits -

df_x <- df_x %>%
  mutate(
    secDep0 = ifelse(sec_dep_perc == 0, 1, 0),
    secDepGen = ifelse((sec_dep_perc > 0) & (sec_dep_perc <= 60), 1, 0),
    secDepHigh = ifelse(sec_dep_perc > 60, 1, 0)
  )
  # test data
tdf <- tdf %>%
  mutate(
    secDep0 = ifelse(sec_dep_perc == 0, 1, 0),
    secDepGen = ifelse((sec_dep_perc > 0) & (sec_dep_perc <= 60), 1, 0),
    secDepHigh = ifelse(sec_dep_perc > 60, 1, 0)
  )

The pricePerGuest can be further divided into dummies based on quantiles -

# df_x %>% 
#   mutate(pPG_q = ntile(pricePerGuest, 5)) %>%
#   group_by(pPG_q, pricePerGuest) %>%
#   summarize(n = n()) %>%
#   group_by(pPG_q) %>%
#   summarize(total = sum(n),
#             min_age = min(pricePerGuest),
#             max_age = max(pricePerGuest))
# 
# summary(df_x$pricePerGuest)

df_x <- df_x %>%
  mutate(
    vlow_pPG = ifelse((pricePerGuest >= 0) & (pricePerGuest < 44), 1, 0),
    low_pPG = ifelse((pricePerGuest >= 44) & (pricePerGuest < 87.5), 1, 0),
    med_pPG = ifelse((pricePerGuest >= 87.5) & (pricePerGuest < 250), 1, 0),
    high_pPG = ifelse((pricePerGuest >= 250) & (pricePerGuest < 1000), 1, 0),
    vHigh_pPG = ifelse(pricePerGuest >= 1000, 1, 0)
  )
  # test data
tdf <- tdf%>%
  mutate(
    vlow_pPG = ifelse((pricePerGuest >= 0) & (pricePerGuest < 44), 1, 0),
    low_pPG = ifelse((pricePerGuest >= 44) & (pricePerGuest < 87.5), 1, 0),
    med_pPG = ifelse((pricePerGuest >= 87.5) & (pricePerGuest < 250), 1, 0),
    high_pPG = ifelse((pricePerGuest >= 250) & (pricePerGuest < 1000), 1, 0),
    vHigh_pPG = ifelse(pricePerGuest >= 1000, 1, 0)
  )

Making dummies of bathroomGuestRatio, bedsGuestRatio, and bedroomsGuestRation -

First, bedsGuestsRatio -

# df_x %>%
#   mutate(bGR_q = ntile(bedsGuestsRatio, 5)) %>%
#   group_by(bGR_q, bedsGuestsRatio) %>%
#   summarize(n = n()) %>%
#   group_by(bGR_q) %>%
#   summarize(total = sum(n),
#             min_age = min(bedsGuestsRatio),
#             max_age = max(bedsGuestsRatio))
# 
# summary(df_x$bedsGuestsRatio)

df_x$bedsGuestsRatio <- replace_na(df_x$bedsGuestsRatio, 1)
  # test data
tdf$bedsGuestsRatio <- replace_na(tdf$bedsGuestsRatio, 1)

df_x <- df_x %>%
  mutate(
    beds_Comfy = ifelse((bedsGuestsRatio >= 0.7333333) & (bedsGuestsRatio < 1.3), 1, 0),
    beds_Cramped = ifelse(bedsGuestsRatio < 0.7333333, 1, 0),
    beds_Excess = ifelse(bedsGuestsRatio > 1.3, 1, 0)
  )
  # test data
tdf <- tdf %>%
  mutate(
    beds_Comfy = ifelse((bedsGuestsRatio >= 0.7333333) & (bedsGuestsRatio < 1.3), 1, 0),
    beds_Cramped = ifelse(bedsGuestsRatio < 0.7333333, 1, 0),
    beds_Excess = ifelse(bedsGuestsRatio > 1.3, 1, 0)
  )

Second, bathroomGuestRatio

# df_x %>%
#   mutate(baGR_q = ntile(bathroomGuestRatio, 5)) %>%
#   group_by(baGR_q, bathroomGuestRatio) %>%
#   summarize(n = n()) %>%
#   group_by(baGR_q) %>%
#   summarize(total = sum(n),
#             min_age = min(bathroomGuestRatio),
#             max_age = max(bathroomGuestRatio))
# 
# summary(df_x$bathroomGuestRatio)

df_x$bathroomGuestRatio <- replace_na(df_x$bedsGuestsRatio, 0.7)
  # test data
tdf$bathroomGuestRatio <- replace_na(tdf$bathroomGuestRatio, 0.7)

df_x <- df_x %>%
  mutate(
    baths_Comfy = ifelse((bedsGuestsRatio >= 0.5) & (bedsGuestsRatio < 1.2), 1, 0),
    baths_Cramped = ifelse(bedsGuestsRatio < 0.5, 1, 0),
    baths_Excess = ifelse(bedsGuestsRatio > 1.2, 1, 0)
  )
  # test data
tdf <- tdf %>%
  mutate(
    baths_Comfy = ifelse((bedsGuestsRatio >= 0.5) & (bedsGuestsRatio < 1.2), 1, 0),
    baths_Cramped = ifelse(bedsGuestsRatio < 0.5, 1, 0),
    baths_Excess = ifelse(bedsGuestsRatio > 1.2, 1, 0)
  )

Finally, bedroomsGuestRation

# summary(df_x$bedroomsGuestRatio)

df_x$bedroomsGuestRatio <- replace_na(df_x$bedroomsGuestRatio, 0.5)
  # test data
tdf$bedroomsGuestRatio <- replace_na(tdf$bedroomsGuestRatio, 0.5)

df_x <- df_x %>%
  mutate(
    bedrooms_Comfy = ifelse((bedroomsGuestRatio >= 0.5) & (bedroomsGuestRatio < 1.2), 1, 0),
    bedrooms_Cramped = ifelse(bedroomsGuestRatio < 0.5, 1, 0),
    bedrooms_Excess = ifelse(bedroomsGuestRatio > 1.2, 1, 0)
  )
  # test data
tdf <- tdf %>%
  mutate(
    bedrooms_Comfy = ifelse((bedroomsGuestRatio >= 0.5) & (bedroomsGuestRatio < 1.2), 1, 0),
    bedrooms_Cramped = ifelse(bedroomsGuestRatio < 0.5, 1, 0),
    bedrooms_Excess = ifelse(bedroomsGuestRatio > 1.2, 1, 0)
  )

There are columns with text in them -

  1. df$access - Something like ‘I like to feel right at home’
  2. df$amenities - {TV, "Cable TV"}
  3. df$description - “Vacation in this Nashville home featuring a premier location”
  4. df$host_about - “I am from Virginia, USA and my wife is from Istanbul, Turkey”
  5. df$host_location - “Nashville, Tennessee, United States”, “Los Angeles, California, United States”
  6. df$host_name - May & Eric, Jonathon, Jeremy, Gene, Holly, Nathan, Gregg, Alicia, Laura, Raymond
  7. df$host_neighbourhood - , Silver Lake, East Village, Near Northeast/H Street Corridor, Bayou St. John
  8. df$host_verifications - “[‘email’, ‘phone’, ‘google’, ‘reviews’]”, “[‘email’, ‘phone’, ‘reviews’, ‘jumio’]”
  9. df$house_rules - “No pets, parties, or indoor smoking. Feel free to use everything in kitchen bu”
  10. df$interaction - “Don’t hesitate to call or text any time throughout your stay! Where you are looking fo”
  11. df$jurisdiction_names - “City of Los Angeles, CA”, “SAN DIEGO, SAN DIEGO TOURISM MARKETING DISTRICT A, SAN”
  12. df$license - “17STR-04969”, “1075925”, “17STR-14190”
  13. df$name - “BRAND NEW ! NASHVILLE LOFT”, “Silverlake Hills Home with View”
  14. df$neighborhood_overview - “This modern loft is locate in a nice neighborhood near to the centennial park and just”
  15. df$neighbourhood - “Silver Lake, Pacific Beach, Near Northeast/H Street Corridor, Bayou St. John, Lower H”
  16. df$notes - “Its a fun beach town - the beach, the bay, the great restaurants, the wonderfu”
  17. df$space - “Beautiful brand new loft in a central location. Near Vandy and Centennial park,only 2”
  18. df$street - “Nashville, TN, United States”, “Silver Lake, Los Angeles, CA 90026, United States”
  19. df$summary - “Vacation in this Nashville home featuring a premier location - just 2 miles from downt”
  20. df$transit - “Ubers are everywhere. We also have car2go’s everywhere if you have an account”

Let’s treat the numerical columns which we are not going to be binning -

Scale availabilities and accommodates -

# summary(df_x$availability_30)
# summary(df_x$availability_60)
# summary(df_x$availability_90)
avail_vars <- df_x %>%
  select(availability_30, availability_60, availability_90, availability_365, accommodates)

processed_vars <- preProcess(avail_vars, method = c("YeoJohnson"))

df_x <- predict(processed_vars, df_x)
  # test data
tdf <- predict(preProcess(tdf %>% select(availability_30, availability_60, availability_90, availability_365, accommodates), method = c("YeoJohnson")), tdf)

Let’s take a look at cleaning fee and replace NAs with the median

qplot(df_x$cleaning_fee)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

qplot(df_x$security_deposit)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

df_x$cleaning_fee <- replace_na(df_x$cleaning_fee, 50)

summary(df_x$cleaning_fee)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   30.00   50.00   63.88   80.00 1000.00

Create standardized columns for price, cleaning_fee, security_deposit, maximum_nights

processed_vars <- preProcess(df_x %>% select(price, extra_people, maximum_nights), method = "YeoJohnson")
df_x <- predict(processed_vars, df_x)
  # test data
tdf <- predict(preProcess(tdf %>% select(price, extra_people, maximum_nights), method = "YeoJohnson"), tdf)

Minimum Nights dummies

# summary(df_x$minimum_nights)

# View(table(df_x$minimum_nights))

df_x$minimum_nights[df_x$minimum_nights > 1250] <- 100 # random
# summary(df_x$minimum_nights)

df_x <- df_x %>%
  mutate(
    minNigh1 = ifelse(minimum_nights == 1, 1, 0),
    minNigh2 = ifelse(minimum_nights == 2, 1, 0),
    minNigh3 = ifelse(minimum_nights == 3, 1, 0),
    minNight_Excess = ifelse(minimum_nights > 3, 1, 0)
  )
  # test data
tdf <- tdf %>%
  mutate(
    minNigh1 = ifelse(minimum_nights == 1, 1, 0),
    minNigh2 = ifelse(minimum_nights == 2, 1, 0),
    minNigh3 = ifelse(minimum_nights == 3, 1, 0),
    minNight_Excess = ifelse(minimum_nights > 3, 1, 0)
  )

Dummies for presence of cleaning fee and security deposit -

(Commented because combined cleaning_fee with price and created dummies for security_deposit)

# df_x <- df_x %>%
#   mutate(
#     isCleaningFee = ifelse(is.na(cleaning_fee) | cleaning_fee == 0, 0, 1),
#     isSecurityDeposit = ifelse(is.na(security_deposit) | security_deposit == 0, 0, 1)
#   )
#   # test data
# tdf <- tdf %>%
#   mutate(
#     isCleaningFee = ifelse(is.na(cleaning_fee) | cleaning_fee == 0, 0, 1),
#     isSecurityDeposit = ifelse(is.na(security_deposit) | security_deposit == 0, 0, 1)
#   )

host_response_rate, host_listings_count, host_total_listings_count

# qplot(df_x_main$host_listings_count, df_x_main$host_total_listings_count)

# Both are same

df_x$host_response_rate <- replace_na(df_x$host_response_rate, 100)
  # test data
tdf$host_response_rate <- replace_na(tdf$host_response_rate, 100)

# Scale

df_x <- predict(preProcess(df_x %>% select(host_listings_count, host_response_rate), method = c("YeoJohnson")), df_x)
  # test data
tdf <- predict(preProcess(tdf %>% select(host_listings_count, host_response_rate), method = c("YeoJohnson")), tdf)